#!/usr/bin/perl

# findhyph - list the words TeX hyphenated in a document.
#
# The script scans the \tracingparagraphs output in <name>.log, works out
# which feasible breakpoints TeX finally chose for every paragraph and writes
# the words broken at those points to <name>.hyph.  With -p it also writes
# one-letter prepositions/conjunctions left at the end of a line to
# <name>.prep.

use strict;
use warnings;

use Getopt::Std;

# getopts() stores its results in package variables, so under "use strict"
# they have to be declared with "our".
our $opt_l = 'kKsSvVzZoOuUiIA';    # defaults for Slovak and Czech
our ($opt_c, $opt_f, $opt_h, $opt_p, $opt_v) = (0, 0, 0, 0, 0);

getopts('cfphvl:');

my $version_msg = "findhyph version 3.4\n";
my $help_msg    = "\nUsage: findhyph [options] <file>\n"
    . "\n"
    . "Options: -c display hyphenated words in context\n"
    . " -f display font selectors\n"
    . " -h display this help message\n"
    . " -v display version\n"
    . " -p generate file with prepositions left at the end of line\n"
    . " -l=STRING use prepositions/conjunctions listed in STRING\n";

print $version_msg if $opt_v;
die $help_msg if $opt_h || !$ARGV[0];

# Accept the bare job name as well as any of the usual file names.
my $filename = $ARGV[0];
$filename =~ s/\.(?:log|tex|dvi|ps|pdf)$//;

open(my $log_fh, '<', "$filename.log")
    or die("Can't read $filename.log: $!\n");
open(my $hyph_fh, '>', "$filename.hyph")
    or die("Can't write $filename.hyph: $!\n");
my $prep_fh;
if ($opt_p) {
    open($prep_fh, '>', "$filename.prep")
        or die("Can't write $filename.prep: $!\n");
}

# ---------------------------------------------------------------------------
# Parser state, shared between the main loop and do_hyph().
# ---------------------------------------------------------------------------
my $search          = 0;       # inside a \tracingparagraphs block
my $search_hyphens  = 0;       # current pass may contain hyphenated words
my $endgraf         = 0;       # "@\par via" line has been seen
my $pageno_allowed  = 0;       # next "[n" in the log is taken as a page number
my $write_hyph_page = 0;       # .hyph has entries waiting for a page number
my $write_prep_page = 0;       # .prep has entries waiting for a page number
my $act_text        = "";      # paragraph text with "@n@" breakpoint markers
my $act_break       = 0;       # number of the most recent feasible break
my $looseness       = 0;       # value announced by an "@looseness=N" line
my $demerits        = 1e10;    # lowest total demerits of a final break so far
my $max_break;                 # number of the chosen final breakpoint
my $max_line;                  # line count belonging to $max_break
my @USED;                      # $USED[n] is set when break n was chosen
my %BREAKS;                    # $BREAKS{n}{prev}: predecessor of break n
my %LOOSE;                     # final-break candidates: {n}{line}, {n}{demerits}

while (my $line = <$log_fh>) {
    $line =~ s/[\r\n]*$//;     # strip the line ending of any platform

    # Deliberately no "next": the line falls through to the checks below,
    # exactly as every other log line does.
    if ($line =~ /^\@looseness=(\-?\d+)$/) {
        $looseness = $1;
    }

    if ($line =~ /^\@firstpass$/) {
        $pageno_allowed = 0;
        $endgraf        = 0;
        $demerits       = 1e10;
        $search         = 1;
        $search_hyphens = 0;
        _reset_pass();
        next;
    }
    if ($line =~ /^\@secondpass$/) {
        $search_hyphens = 1;
        _reset_pass();
        next;
    }
    if ($line =~ /^\@emergencypass$/) {
        _reset_pass();
        next;
    }

    # Feasible breakpoint: "@@n: line l.c[-] t=demerits [s=.. g|a=..] -> @@prev"
    if ($search
        && $line =~ /^@@(\d+): line (\d+)\..+ t=(\-?\d+) (?:s=\S+ (?:g|a)=\S+ )?\-\> @@(\d+)/)
    {
        my ($break, $line_count, $total, $prev) = ($1, $2, $3, $4);

        $act_break = $break;
        $BREAKS{$break}{'prev'} = $prev;

        # After "@\par via" the breaks are candidates for the paragraph end;
        # remember the one with the lowest total demerits.
        if ($endgraf && $total < $demerits) {
            $max_break = $break;
            $max_line  = $line_count;
            $demerits  = $total;
        }
        # With non-zero \looseness every candidate is kept for do_hyph().
        if ($endgraf && $looseness != 0) {
            $LOOSE{$break}{"line"}     = $line_count;
            $LOOSE{$break}{"demerits"} = $total;
        }
        $act_text .= "\@$act_break\@";
        next;
    }

    if ($search && $line =~ /^@\\par via @@/) {
        $endgraf = 1;
        next;
    }
    if ($search && $line =~ /^@\\discretionary via @@/) {
        $search_hyphens = 1;    # @firstpass may contain words hyphenated by user
        next;                   # using \discretionary
    }
    if ($search && $line =~ /^@/) {
        next;
    }

    # An empty line after the final breaks ends the paragraph trace.
    if ($search && $endgraf && $line eq "") {
        $pageno_allowed = 1;
        do_hyph();
        $search    = 0;
        $looseness = 0;
        next;
    }

    if ($search) {
        $act_text .= $line;
    }
    elsif ($pageno_allowed
        && $line =~ /\[(\-?\d+)(\.\-?\d+){0,9} ?(\]|\{|\<|$)/)
    {
        # page number
        my $pageno = $1;
        $pageno_allowed = 0;
        if ($write_hyph_page) {
            print {$hyph_fh} "[$pageno]\n\n";
            $write_hyph_page = 0;
        }
        if ($write_prep_page) {
            print {$prep_fh} "[$pageno]\n\n";
            $write_prep_page = 0;
        }
    }
}

close($log_fh);
# Buffered write errors only surface at close, so check the output handles.
close($hyph_fh) or die("Can't write $filename.hyph: $!\n");
if ($opt_p) {
    close($prep_fh) or die("Can't write $filename.prep: $!\n");
}

# Forget everything collected for the current line-breaking pass; called at
# the start of the first, second and emergency pass.
sub _reset_pass {
    $act_break = 0;
    $act_text  = "";
    @USED      = ();
    %BREAKS    = ();
    %LOOSE     = ();
    return;
}

# Evaluate one finished paragraph trace.
#
# Takes no arguments; works on the file-level parser state ($act_text,
# %BREAKS, %LOOSE, $max_break, $max_line, $looseness, ...).  Determines the
# chain of breakpoints actually used, prints every word broken at one of them
# to the .hyph file and, with -p, every listed one-letter word standing right
# before a break to the .prep file.
sub do_hyph {

    # With \looseness in effect the final break is the candidate whose line
    # count is closest to (optimum + looseness); equal distances are decided
    # by demerits.  Keys are visited in numeric order so that complete ties
    # are resolved the same way on every run.
    if ($looseness != 0) {
        my $diff_line  = 1e10;
        my $diff_demer = 1e10;
        for my $cand (sort { $a <=> $b } keys %LOOSE) {
            my $dist = abs($LOOSE{$cand}{"line"} - $max_line - $looseness);
            if ($dist < $diff_line) {
                $diff_line  = $dist;
                $diff_demer = $LOOSE{$cand}{"demerits"};
                $max_break  = $cand;
            }
            if ($dist == $diff_line && $LOOSE{$cand}{"demerits"} < $diff_demer) {
                $diff_demer = $LOOSE{$cand}{"demerits"};
                $max_break  = $cand;
            }
        }
    }

    # Walk the predecessor chain back from the final break and flag every
    # break on it.
    my $br = $max_break;
    while (1) {
        $br = $BREAKS{$br}{'prev'};
        last if ($br == 0);
        $USED[$br] = 1;
    }

    # Drop the markers of feasible breaks that were not chosen.
    for my $i (1 .. $max_break - 1) {
        if (not defined $USED[$i]) {
            $act_text =~ s/\@$i\@//;
        }
    }

    if ($opt_f) {
        $act_text =~ s/(\\\S+)\s+/$1\_/g;
    }
    else {
        $act_text =~ s/\\\S+\s+//g;    # usually font selector, but also backslash in the text
    }

    for my $i (1 .. $max_break - 1) {
        next if not defined $USED[$i];

        if ($search_hyphens) {    # hyphenated words
            if ($act_text =~ /(\S+)(\@$i\@)(\S+)/) {    # only hyphenated words match
                my $out_pre  = $1;
                my $out_post = $3;

                if (substr($out_pre, -1) ne "-") {    # hyphenated ligatures
                    # The log repeats the ligature characters after the
                    # hyphen; move them over to the second part.  When the
                    # text holds no hyphen at all, rindex() returns -1 and
                    # the parts are left as they are instead of being
                    # mangled.
                    my $hyphen_at = rindex($out_pre, "-");
                    if ($hyphen_at >= 0) {
                        $out_post = substr($out_pre, $hyphen_at + 1) . $out_post;
                        $out_pre  = substr($out_pre, 0, $hyphen_at);
                    }
                }
                if ($opt_c) {
                    if ($act_text =~ /(\S+)\s+\S+\@$i\@/) { $out_pre  = "$1 $out_pre"; }
                    if ($act_text =~ /\@$i\@\S+\s+(\S+)/) { $out_post = "$out_post $1"; }
                }
                $out_pre  =~ s/-//g;
                $out_post =~ s/-//g;
                my $out_text = "$out_pre-$out_post";
                $out_text =~ s/\@\d+\@//g;    # very narrow columns
                print {$hyph_fh} "$out_text\n";
                $write_hyph_page = 1;
            }
        }

        if ($opt_p) {    # prepositions
            if ($act_text =~ / (\S) \@$i\@/) {
                my $out_text = $1;
                # $opt_l is used as a character class, so ranges such as
                # "a-z" given with -l keep working.
                if ($out_text =~ /^[$opt_l]$/) {
                    if ($opt_c) {
                        if ($act_text =~ /(\S+) \S \@$i\@/) { $out_text = "$1 $out_text"; }
                        if ($act_text =~ /\@$i\@ ?(\S+)/)   { $out_text = "$out_text $1"; }
                        $out_text =~ s/-//g;
                        $out_text =~ s/\@\d+\@//g;    # very narrow columns
                    }
                    print {$prep_fh} "$out_text\n";
                    $write_prep_page = 1;
                }
            }
        }
    }
    return;
}

__END__

=encoding utf8

=head1 NAME

B<findhyph> – find words hyphenated by TeX in a document

=head1 INSTALLATION

Copy B<findhyph> or B<findhyph.bat> (depending on OS used) to a directory
included in system PATH. Perl interpreter is required to be in C</usr/bin>
for Unix-like systems or in PATH when using B<findhyph.bat>.

=head1 SYNOPSIS

B<findhyph> F<document.log>

=head1 DESCRIPTION

To use this program:

=over 4

=item 1) set C<\tracingparagraphs=1> in a TeX document F<document.tex> and run:

=item 2) B<tex> F<document.tex>

=item 3) B<findhyph> F<document.log>

=back

If you are setting C<\looseness=E<lt>NE<gt>> to optimize paragraphs, you need
to pass that information to the file F<document.log> in a form
C<@looseness=E<lt>NE<gt>> (on a separate line). You can use a macro like
C<\def\setlooseness#1{\looseness=#1 \immediate\write-1{@looseness=#1}}> for
this purpose. If the paragraph contains material in display math mode, you
need to use the macro in all split parts of the paragraph.

=head1 OPTIONS

=over 4

=item B<-c>

display hyphenated words in context

=item B<-f>

display font selectors and other strings starting with a backslash character

=item B<-v>

display program version

=item B<-p>

generate file containing information about one-letter prepositions and
conjunctions left at the end of line

=item B<-l=STRING>

use prepositions/conjunctions listed in STRING instead of default list of
prepositions and conjunctions C<kKsSvVzZoOuUiIA> used for Slovak and Czech
language

=back

=head1 OUTPUT FILES

=over 4

=item F<document.hyph>

List of hyphenated words.
All punctuation characters, parentheses and other characters immediately
preceding or following displayed words are included in this list. TeX
constructs which are too difficult to display (C<\hbox{}>, C<\mark{}> etc.)
are shown as C<[]>. Math mode is indicated by C<$> sign.

Page numbers in square brackets refer to the LOG file and may occasionally
differ from the typeset document. The reason is that TeX may need to break
more paragraphs than it would eventually fit on the page in order to find a
page break.

Words hyphenated in footnotes are listed before the words hyphenated in the
paragraph in which the footnote is referenced.

=item F<document.prep>

List of prepositions if option B<-p> is used.

=back

=head1 HISTORY

=head4 1.0 (2001-04-08)

=over 4

=item * public release

=back

=head4 2.0 (2009-08-10)

=over 4

=item * fixes in line breaks detection algorithm; support for the third pass
of line breaking algorithm in TeX (positive C<\emergencystretch>); support for
discretionary breaks in the first pass

=item * page number detection improved (recognized negative page numbers,
compound page numbers when C<\count1> to C<\count9> registers are non-zero and
C<[nn{mapfile}]>, C<[nnE<lt>pictureE<gt>]> and C<[nnE<lt>newlineE<gt>>
formats used by pdfTeX; false page number detection should be much more rare)

=item * configurable list of prepositions and conjunctions

=item * hyphenated words can be displayed in context

=item * suggestions and testing by Pavel Stříž

=back

=head4 3.0 (2012-02-01)

=over 4

=item * fixed a bug when total paragraph demerits are negative

=item * C<\looseness> setting is now taken into account (thanks to Karel
Horák for identifying the issue)

=back

=head4 3.1 (2012-11-03)

=over 4

=item * fixed displaying of hyphenated ligatures (bug reported by Karel Horák)

=back

=head4 3.2 (2012-11-21)

=over 4

=item * improved example definition of the C<\setlooseness> macro (thanks to
Karl Berry)

=back

=head4 3.3 (2013-06-18)

=over 4

=item * fixed the manual page as suggested by Eric S.
Raymond

=back

=head4 3.4 (2015-10-18)

=over 4

=item * fixed processing of logs produced using ε-TeX's C<\lastlinefit>
option (bug reported by Karel Horák)

=item * fixed cross-platform handling of linebreaks in the input file

=back

=head1 LICENSE

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 2 of the License, or any later version.

=head1 AUTHOR

Copyright (c) Martin Budaj E<lt>m.b@speleo.skE<gt> 2000, 2001, 2009, 2012, 2015

=cut