use strict;
# Sample input for the pattern below: one entry per line, in the layout of the
# RFC 3454 tables (single code points, code point ranges, "code; replacement;
# comment" mappings), mixed with table start/end markers and a page
# footer/header pair.  The heredoc is non-interpolating, so the text is taken
# literally; the leading space on the first line is part of the data.
my $str = <<'END_SAMPLE';
 F001-AB12
0221
0234-024F
02AE-02AF
03AB; 03CB; Case map
03B0; 03C5 0308 0301; Case map
03C2; 03C3; Case map
03B0; 03C5 0308 0301 0A1B; Case map
----- Start Table A.1 -----
03D0; 03B2; Case map
03D1; 03B8; Case map
----- End Table A.1 -----
03D2; 03C5; Additional folding
03D3; 03CD; Additional folding
00DF; 0073 0073; Case map
037B-037D
037F-0383
038B
Hoffman & Blanchet Standards Track [Page 89]
RFC 3454 Preparation of Internationalized Strings December 2002
1806; ; Map to nothing
1806; ; Map to nothing
1806; ; Map to nothing
1806; ; Map to nothing
F0000-FFFFD
100000-10FFFD
F0000
013B; 013C; Case map
013D; 013E; Case map
0080-009F; [CONTROL CHARACTERS]
06DD; ARABIC END OF AYAH
070F; SYRIAC ABBREVIATION MARK
180E; MONGOLIAN VOWEL SEPARATOR
200C; ZERO WIDTH NON-JOINER
200D; ZERO WIDTH JOINER
2028; LINE SEPARATOR
2029; PARAGRAPH SEPARATOR
2060; WORD JOINER
2061; FUNCTION APPLICATION
2062; INVISIBLE TIMES
013F; 0140; Case map
0141; 0142; Case map
0143; 0144; Case map
0145; 0146; Case map
0147; 0148; Case map
0149; 02BC 006E; Case map
014A; 014B; Case map
014C; 014D; Case map
END_SAMPLE
# One code point as written in the sample text: 4 to 7 characters drawn from
# 0-9 and upper-case A-F.  Kept as a plain string so that it is spliced into
# the pattern textually.
my $hex_code = '[A-F0-9]{4,7}';

# Pattern for a single entry or table marker in the sample text.  The
# alternatives are tried in the order listed, so the more specific mapping
# forms come before the generic ones.  Flags: /x lets the pattern be laid out
# over several lines (literal spaces are written as "\ "), /m lets ^ match at
# the start of every line, /p makes ${^MATCH} and friends available.
# The name "replace" is deliberately used by several groups of the same
# alternative; the unnamed groups inside the last alternative are kept so that
# the numbering of capture groups stays as it was.
my $regex = qr{
    # Mapping entry: code; optional leading text then three or more codes; comment.
    (?<char>$hex_code);(?<replace>.*)(?:\s(?<replace>$hex_code))+(?=\s)+(?:\s(?<replace>$hex_code))+(?=\s)+(?:\s(?<replace>$hex_code))+;\ (?<comment>.*)\n
  |
    # Mapping entry: code; optional leading text then two or more codes; comment.
    (?<char>$hex_code);(?<replace>.*)(?:\s(?<replace>$hex_code))+(?=\s)+(?:\s(?<replace>$hex_code))+;\ (?<comment>.*)\n
  |
    # Mapping entry: code; two or more codes and nothing else; comment.
    (?<char>$hex_code);(?:\s(?<replace>$hex_code))+(?=\s)+(?:\s(?<replace>$hex_code))+;\ (?<comment>.*)\n
  |
    # Mapping entry: code; any other replacement text, possibly blank; comment.
    (?<char>$hex_code);(?<replace>.*);\ (?<comment>.*)\n
  |
    # Range of codes followed by a description.
    (?<range_start>$hex_code)-(?<range_end>$hex_code);\ (?<comment>.*)\n
  |
    # Single code followed by a description.
    (?<char>$hex_code);\ (?<comment>.*)\n
  |
    # Range of codes immediately followed by a newline.
    (?<range_start>$hex_code)-(?<range_end>$hex_code)\n
  |
    # Single code after exactly three spaces at the start of a line.
    ^\ {3}(?<char>$hex_code)\n
  |
    # Table marker text, matched only when the closing dashes and newline follow.
    (?<appendix>(?<appendix_type>Start|End)\sTable\s(?<appendix_number>(\w).(?<appendix_order>(\d)))(?=\ -----\n))
}xmp;
# Report every entry of $str that the pattern recognises.  A /g match in
# scalar context resumes where the previous successful match ended, so the
# loop visits each match in turn and stops at the first failure (which also
# resets pos($str)).  A plain "if" here would report only the first match.
while ( $str =~ /$regex/g ) {
    # ${^MATCH} holds the text of the current match; it relies on the pattern
    # having been compiled with /p.  Matches that include the line's trailing
    # newline will show it inside the message.
    print "Whole match is ${^MATCH} and its start/end positions can be obtained via \$-[0] and \$+[0]\n";
    # print "Capture Group 1 is $1 and its start/end positions can be obtained via \$-[1] and \$+[1]\n";
    # print "Capture Group 2 is $2 ... and so on\n";
}
# ${^POSTMATCH} and ${^PREMATCH} are also available with the use of '/p'
# Named capture groups can be called via $+{name}
# Please keep in mind that these code samples are automatically generated and
# are not guaranteed to work. If you find any syntax errors, feel free to
# submit a bug report. For a full regex reference for Perl, please visit:
# http://perldoc.perl.org/perlre.html