# -*- coding: utf-8 -*- # Author: Franck Sajous # Copyright: CNRS/CLLE-ERSS # Last update: 2018-09-04 # See info and licence at: http://redac.univ-tlse2.fr/lexicons/glawi/tools/ use strict; use warnings; use utf8; use Getopt::Std; our ($opt_f, $opt_l, $opt_H, $opt_p, $opt_w); getopts('f:lp:w:H'); binmode (STDOUT, ":encoding(utf8)"); binmode (STDERR, ":encoding(utf8)"); &usage () if (($#ARGV < 0) || ($#ARGV > 1) || (defined ($opt_f) && defined ($opt_w)) || ((!defined ($opt_f)) && (!defined ($opt_w)))); my $glawiFile = $ARGV[0]; my $outFile = $ARGV[1]; open (GLAWI, "<", $glawiFile) or die ("Unable to read: $glawiFile\n"); binmode (GLAWI, ":encoding(utf8)"); if (defined ($outFile)) { open (OUTFILE, ">", $outFile) or die ("Unable to write: $outFile\n"); binmode (OUTFILE, ":encoding(utf8)"); } # if (defined ($outFile)) my %wordValues; &readWordValuesFile ($opt_f) if (defined ($opt_f)); my ($line, $currentTitle, $currentPOS); my $currentBuffer = ""; my ($inArticle, $inGloss, $inText, $posMatches) = (0, 0, 0, 0); &outputHTMLheader () if (defined ($opt_H)); while ($line = ) { if ($line =~ /
/) { $currentBuffer = $line; } # if ($line =~ /
/) else { if ($line =~ /<\/article>/) { $inArticle = 1; &parseLine ($'); #' } # if ($line =~ /<\/article>/) elsif ($inArticle) { if ($line =~ /<\/article>/) { &parseLine ($`); $inArticle = 0; } # elsif ($line =~ /<\/article>/) else { &parseLine ($line); } # elsif ($line =~ /<\/article>/) else { } # elsif ($inArticle) } # if ($line =~ /
/) else { } # while () close (GLAWI); &outputHTMLfooter () if (defined ($opt_H)); close (OUTFILE) if (defined ($outFile)); sub parseLine () { my $currentLine = shift; if ($currentLine =~ /(.*?)<\/title>/) { $currentTitle = $1; } elsif ($currentLine =~ /<pos .*?type=\"(.*)\"/) { $currentPOS = $1; $posMatches = ((!defined ($opt_p)) || ($currentPOS =~ /$opt_p/)); if ($posMatches) { $posMatches = ((!defined ($opt_l)) || ($currentLine !~ /lemma="0"/)); } # if ($posMatches) } # elsif ($currentLine =~ /<pos .*?type=\"(.*)\"/) elsif ($currentLine =~ /<gloss>/) { my ($l, $r) = ($`, $'); #' &parseLine ($l); $inGloss = 1; &parseLine ($r); } # if ($currentLine =~ /<gloss>/) elsif ($currentLine =~ /<\/gloss>/) { my ($l, $r) = ($`, $'); #' &parseLine ($l); $inGloss = 0; &parseLine ($r); } # elsif ($currentLine =~ /<\/gloss>/) elsif ($posMatches && $inGloss) { if ($currentLine =~ /<txt>/) { my ($l, $r) = ($`, $'); #' &parseLine ($l); $inText = 1; $currentBuffer = ""; &parseLine ($r); } # elsif ($currentLine =~ /<txt[^<>]*>/) elsif ($inText) { if ($currentLine =~ /<\/txt>/) { my ($l, $r) = ($`, $'); #' &parseLine ($l); $inText = 0; my $matches = &testMatches ($currentBuffer); if ($matches ne "") { &output ($matches); } # if ($matches) &parseLine ($r); } # elsif ($currentLine =~ /<\/txt>/) else { $currentBuffer .= $currentLine; } # if ($currentLine =~ /<\/txt>/) else { } # elsif ($inText) } # elsif ($inGloss) } # parseLine () sub testMatches () { my $glossTxt = shift; my $result = ""; if (defined ($opt_w)) { my $regex = "(\\W)(" . $opt_w . ")(\\W)"; if ($glossTxt =~ /$regex/) { my ($l, $m, $r) = ($` . $1, $2, $' . $3); #' $result = &formatOutput ($l, $m, $r); } } # if (defined ($opt_w)) elsif (defined ($opt_f)) { foreach my $wordRegexp (keys (%wordValues)) { my $regex = "(\\W)(" . $wordRegexp . ")(\\W)"; if ($glossTxt =~ /$regex/) { my ($l, $m, $r) = ($` . $1, $2, $3 . $'); #' $result .= &formatOutput ($l, $m, $r); } # if ($glossTxt =~ /$wordRegexp/) } # foreach my $wordRegexp (keys (%wordValues)) } # elsif (defined ($opt_f)) return $result; } # testMatches () sub formatOutput () { my ($l, $m, $r) = @_; my $result; if (defined ($opt_H)) { $result = "<tr><td class='articleTitle'>" . $currentTitle . "</td><td class='left'>" . $l . "</td><td class='match'>" . $m . "</td><td class='right'>" . $r . "</td></tr>\n"; } # if (defined ($opt_H)) else { $result = $currentTitle . ":\t" . $l . "\t" . $m . "\t" . $r . "\n"; } # if (defined ($opt_H)) else { return $result; } # formatOutput () sub readWordValuesFile () { my $filePath = shift; my $line; open (VALUES, "<", $filePath) or die ("Unable to read: $filePath\n"); binmode (VALUES, ":encoding(utf8)"); while ($line = <VALUES>) { chomp ($line); $wordValues{$line} = 1 if ($line !~ /^\s*$/); } # while ($line = <VALUES>) close (VALUES); } # readWordValuesFile () sub output () { my $txtToOutput = shift; if (defined ($outFile)) { print OUTFILE $txtToOutput; } # if (defined ($outFile)) else { print $txtToOutput; } # if (defined ($outFile)) else { } # sub output () sub outputHTMLheader () { &output ("<!DOCTYPE html>\n<html>\n<head>\n"); &output ("<meta charset=\"UTF-8\">\n"); &output ("<title>GLAWI's glosses\n"); &output ("<style type=\"text/css\">\n"); &output (".articleTitle {font-weight: bold; vertical-align: top}\n"); &output (".match {font-weight: bold; color: darkgreen; text-align:center; vertical-align: top}\n"); &output (".left {text-align: right; vertical-align: top}\n"); &output (".right {text-align: left; vertical-align: top}\n"); &output ("</style>\n</head>\n<body>\n"); &output ("<table>\n"); } # outputHTMLheader () sub outputHTMLfooter () { &output ("</table>\n"); &output ("</body>\n"); &output ("</html>\n"); } # outputHTMLheader () sub usage () { print ("Usage: $0 [-p POS] [-l] [-w word] [-f lexiconFile] GLAWIfile [outFile]\n"); print "\tExtracts glosses matching a set of criteria.\n"; print ("\t-p POS: selects only glosses corresponding to a given part of speech (regexp match).\n"); print ("\t-l: lemmas only.\n"); print ("\t-w word: extracts glosses including a given word\n"); print ("\t-f lexiconFile: file containing words to be found in glosses (format: one word perl line, UTF-8 encoded)\n"); print ("\t-H: HTML output.\n"); print ("\tNote: -f and -w options are mutually exclusive. Regexp matches are performed to test word occurences.\n"); die ("\n"); } # usage ()