--- spelling-stage-1.lua
--- Copyright 2012, 2013 Stephan Hennig
--
-- This work may be distributed and/or modified under the conditions of
-- the LaTeX Project Public License, either version 1.3 of this license
-- or (at your option) any later version.  The latest version of this
-- license is in http://www.latex-project.org/lppl.txt
-- and version 1.3 or later is part of all distributions of LaTeX
-- version 2005/12/01 or later.
--
-- See file README for more information.
--


--- Handle lists of bad and good strings and match rules.
--
-- @author Stephan Hennig
-- @copyright 2012, 2013 Stephan Hennig
-- @release version 0.41
--
-- @trick Prevent LuaDoc from looking past here for module description.
--[[ Trick LuaDoc into entering 'module' mode without using that command.
module(...)
--]]
-- Module table.
local M = {}


-- Import external modules.
local unicode = require('unicode')
local xml = require('luaxml-mod-xml')


-- Function short-cuts (byte-oriented and table helpers first, then the
-- UTF-8 aware string functions).
local Sfind, tabinsert = string.find, table.insert
local Ufind, Ugmatch, Usub =
  unicode.utf8.find, unicode.utf8.gmatch, unicode.utf8.sub


-- Declare local variables to store references to resources that are
-- provided by external code.  All four stay `nil` until the module
-- initialisation function has run.
--
-- Tables of known bad and known good strings.
local __is_bad, __is_good
--
-- Tables of bad and good match rules.
local __rules_bad, __rules_good


--- Generic function for reading bad or good spellings from a file.
-- The complete file content is read into one string, which is handed
-- to the given parse function together with the target table.  A
-- warning is written if the file cannot be opened; an info message
-- with the total/new counts is written in either case.
--
-- @param fname  File name.
-- @param parse_string  Custom parse function.  Called with the file
-- content and `t`; must return the number of total and new strings.
-- @param t  Mapping table bad or good spellings should be added to.
-- @param hint  String for info message.  Either `bad` or `good`.
local function __parse_file(fname, parse_string, t, hint)
  -- Both counters remain zero when the file cannot be opened.
  local n_total, n_new = 0, 0
  local fh, msg = io.open(fname, 'r')
  if not fh then
    texio.write_nl('package spelling: Warning! ' .. msg)
  else
    -- Read everything at once and release the handle before parsing.
    local content = fh:read('*all')
    fh:close()
    n_total, n_new = parse_string(content, t)
  end
  -- The summary is written whether or not the file could be read.
  local info = 'package spelling: Info! ' .. n_total .. '/' .. n_new .. ' total/new ' .. hint .. ' strings read from file \'' .. fname .. '\'.'
  texio.write_nl(info)
end


--- Generic function for parsing a string containing a plain list of
-- strings.  The input holds one string per line; lines are delimited
-- by any run of new line and carriage return characters, so empty
-- lines are skipped.  Every line found is mapped to the boolean value
-- `true` in the given table.
--
-- @param s  Input string (a list of strings).
-- @param t  Table that maps strings to the value `true`.
-- @return Number of total and new strings found.
local function __parse_plain_list(s, t)
  local seen, added = 0, 0
  -- Walk through the input line by line.
  for line in Ugmatch(s, '[^\r\n]+') do
    seen = seen + 1
    -- Only lines not yet present in the table count as new.
    if not t[line] then
      t[line] = true
      added = added + 1
    end
  end
  return seen, added
end


--- Parse a plain list of bad strings read from a file.
-- Every line of the file (a word with known incorrect spelling) is
-- mapped to the boolean value `true` in table `__is_bad`.  The input
-- format is one string per line.
--
-- @param fname  File name.
local function parse_bad_plain_list_file(fname)
  -- Delegate to the generic reader; entries go into the bad table.
  local target, label = __is_bad, 'bad'
  __parse_file(fname, __parse_plain_list, target, label)
end
M.parse_bad_plain_list_file = parse_bad_plain_list_file


--- Parse a plain list of good strings read from a file.
-- All strings found (words with known correct spelling) are mapped to
-- the boolean value `true` in table `__is_good`.  The format of the
-- input file is one string per line.
--
-- @param fname  File name.
local function parse_good_plain_list_file(fname)
  -- Delegate to the generic reader; entries go into the good table.
  local target, label = __is_good, 'good'
  __parse_file(fname, __parse_plain_list, target, label)
end
M.parse_good_plain_list_file = parse_good_plain_list_file


--- Get a custom LanguageTool XML handler.
-- The returned XML handler scans LanguageTool XML data for incorrect
-- spellings.  For every incorrect spelling found, the given call-back
-- function is called with the incorrect spelling string as argument.
--
-- XML data is checked for being created by LanguageTool (via attribute
-- software in tag matches).
--
-- A missing attribute table is tolerated in every call-back, and
-- `error` elements lacking the `ruleid`, `context`, `contextoffset` or
-- `errorlength` attributes are skipped instead of raising a Lua
-- run-time error.
--
-- @param cb  Call-back function handling incorrect spellings found in
-- XML data.
-- @return XML handler.
local function __get_XML_handler_LanguageTool(cb)

  -- Some flags for checking validity of XML data.  LanguageTool XML
  -- data must declare as being UTF-8 encoded and advertise as being
  -- created by LanguageTool.
  local is_XML_encoding_UTF_8 = false
  local is_XML_creator_LanguageTool = false
  local is_XML_valid = false

  --- Handler object for parsing LanguageTool XML data.
  -- This table contains call-backs used by LuaXML when parsing XML
  -- data.
  --
  -- @class table
  -- @name XML_handler
  -- @field decl  Handle XML declaration.
  -- @field starttag  Handle all relevant tags.
  -- @field endtag  Not used, but mandatory.
  local XML_handler = {

    decl = function(self, text, attr)
      -- Check XML encoding declaration.  A declaration without any
      -- attributes is reported with the same message as a wrong
      -- encoding rather than failing on a nil index.
      if attr and attr.encoding == 'UTF-8' then
        is_XML_encoding_UTF_8 = true
        is_XML_valid = is_XML_encoding_UTF_8 and is_XML_creator_LanguageTool
      else
        error('package spelling: Error! XML data not in the UTF-8 encoding.')
      end
    end,

    starttag = function(self, text, attr)
      -- Process tag.
      if text == 'matches' then
        -- Check XML creator is LanguageTool.
        if attr and attr.software == 'LanguageTool' then
          is_XML_creator_LanguageTool = true
          is_XML_valid = is_XML_encoding_UTF_8 and is_XML_creator_LanguageTool
        end
      -- Check XML data is valid.  Any tag other than `matches` seen
      -- before both validity flags are set aborts parsing.
      elseif not is_XML_valid then
        error('package spelling: Error! No valid LanguageTool XML data.')
      -- Process tags.
      elseif text == 'error' then
        -- Same nil guard as in the `matches` branch.
        local ruleid = attr and attr.ruleid
        if ruleid == 'HUNSPELL_RULE'
          or ruleid == 'HUNSPELL_NO_SUGGEST_RULE'
          or ruleid == 'GERMAN_SPELLER_RULE'
          or (ruleid and Ufind(ruleid, '^MORFOLOGIK_RULE_'))
        then
          -- Attribute values are converted explicitly instead of
          -- relying on implicit string arithmetic.
          local offset = tonumber(attr.contextoffset)
          local length = tonumber(attr.errorlength)
          if attr.context and offset and length then
            -- Extract misspelled word from context attribute.  The
            -- offset is zero-based, `Usub` indices are one-based.
            local word = Usub(attr.context, offset + 1, offset + length)
            cb(word)
          end
        end
      end
    end,

    endtag = function(self, text)
    end,

  }

  return XML_handler
end


--- Parse a string containing LanguageTool XML data.
-- All incorrect spellings found in the given XML data are mapped to the
-- boolean value `true` in the given table.
--
-- @param s  String containing XML data.
-- @param t  Table mapping incorrect spellings to a boolean.
-- @return Number of total and new incorrect spellings found.
local function __parse_XML_LanguageTool(s, t)
  local total_c = 0
  local new_c = 0

  -- Create call-back for custom LanguageTool XML handler that stores a
  -- bad word in the given table and does some statistics.
  local cb_incorrect_spelling = function(word)
    if not t[word] then
      t[word] = true
      new_c = new_c + 1
    end
    total_c = total_c + 1
  end

  -- Create custom XML handler.
  local XML_handler_LT = __get_XML_handler_LanguageTool(cb_incorrect_spelling)
  -- Create custom XML parser.
  local x = xml.xmlParser(XML_handler_LT)
  -- Parse XML data.
  x:parse(s)
  return total_c, new_c
end


--- Parse LanguageTool XML data read from a file.
-- All strings found in the file (words with known incorrect spelling)
-- are mapped to the boolean value `true` in table `__is_bad`.
--
-- @param fname  File name.
local function parse_XML_LanguageTool_file(fname)
  __parse_file(fname, __parse_XML_LanguageTool, __is_bad, 'bad')
end
M.parse_XML_LanguageTool_file = parse_XML_LanguageTool_file


--- Parse default sources for bad and good strings.
-- All strings found in default sources for words with known incorrect
-- spelling are mapped to the boolean value `true` in table `__is_bad`.
-- All strings found in default sources for words with known correct
-- spelling are mapped to the boolean value `true` in table `__is_good`.
-- Default sources for bad spellings are files `<jobname>.spell.xml` (a
-- LanguageTool XML file) and `<jobname>.spell.bad` (a plain list file).
-- Default sources for good spellings are file `<jobname>.spell.good` (a
-- plain list file).  Files that cannot be opened are skipped.
local function parse_default_bad_and_good()
  -- Default sources in processing order: suffix appended to the job
  -- name, and the parser responsible for that kind of file.
  local sources = {
    { '.spell.xml', parse_XML_LanguageTool_file },
    { '.spell.bad', parse_bad_plain_list_file },
    { '.spell.good', parse_good_plain_list_file },
  }
  for _, source in ipairs(sources) do
    local fname = tex.jobname .. source[1]
    -- Probe the file first, so that a missing default file does not
    -- trigger the parser (and its messages) at all.
    local probe = io.open(fname, 'r')
    if probe then
      probe:close()
      source[2](fname)
    end
  end
end
M.parse_default_bad_and_good = parse_default_bad_and_good


--- Default bad dictionary look-up match rule.
-- Looks up both arguments in the list of bad spellings, the stripped
-- string first.  The result is the value stored for the first argument
-- found (the parsers in this file store `true`), or `nil` if neither
-- argument is in the list.
--
-- @param raw  Raw string to check.
-- @param stripped  Same as `raw`, but with stripped surrounding
-- punctuation.
-- @return A truthy value indicating a match, otherwise `nil`.
local function __bad_rule_bad_dictionary_lookup(raw, stripped)
  local hit = __is_bad[stripped]
  if hit then
    return hit
  end
  return __is_bad[raw]
end


--- Default good dictionary look-up match rule.
-- Looks up both arguments in the list of good spellings, the stripped
-- string first.  The result is the value stored for the first argument
-- found (the parsers in this file store `true`), or `nil` if neither
-- argument is in the list.
--
-- @param raw  Raw string to check.
-- @param stripped  Same as `raw`, but with stripped surrounding
-- punctuation.
-- @return A truthy value indicating a match, otherwise `nil`.
local function __good_rule_good_dictionary_lookup(raw, stripped)
  local hit = __is_good[stripped]
  if hit then
    return hit
  end
  return __is_good[raw]
end


--- Load match rule module.
-- Match rule modules are loaded using `require`.  The module table must
-- follow the following convention: Identifiers of bad match rules
-- start `bad_rule_`.
-- Identifiers of good match rules start `good_rule_`.  Other and
-- non-function identifiers are ignored, as are entries whose key is
-- not a string.
--
-- All match rules found in a module are added to the table of bad and
-- good match rules.  Arguments of a match rule function are a raw
-- string and the same string with stripped surrounding punctuation.
--
-- An error is raised if the module does not return a table.
--
-- @param fname  Module file name.
local function read_match_rules(fname)
  local bad_c = 0
  local good_c = 0
  local rules = require(fname)
  -- `require` hands back whatever the module returned, which is not
  -- necessarily a table; fail with a clear message instead of letting
  -- `pairs` raise an opaque one.
  if type(rules) ~= 'table' then
    error('package spelling: Error! Match rule module \'' .. fname .. '\' does not return a table.')
  end
  for k,v in pairs(rules) do
    -- Only string keys can carry a rule prefix.  Checking the key type
    -- keeps `Sfind` from raising on keys such as booleans or tables.
    if type(k) == 'string' and type(v) == 'function' then
      if Sfind(k, '^bad_rule_') then
        tabinsert(__rules_bad, v)
        bad_c = bad_c + 1
      elseif Sfind(k, '^good_rule_') then
        tabinsert(__rules_good, v)
        good_c = good_c + 1
      end
    end
  end
  texio.write_nl('package spelling: Info! ' .. bad_c .. '/' .. good_c .. ' bad/good match rules read from module \'' .. fname .. '\'.')
end
M.read_match_rules = read_match_rules


--- Module initialisation.
-- Fetches the shared rule tables from the global `PKG_spelling.res`,
-- registers the two default dictionary look-up rules and creates the
-- (initially empty) tables of known bad and good strings.
--
local function __init()
  -- Get local references to package resources.
  __rules_bad = PKG_spelling.res.rules_bad
  __rules_good = PKG_spelling.res.rules_good
  -- Add default dictionary look-up match rules.
  tabinsert(__rules_bad, __bad_rule_bad_dictionary_lookup)
  tabinsert(__rules_good, __good_rule_good_dictionary_lookup)
  -- Create empty lists of known spellings.
  __is_bad = {}
  __is_good = {}
end


-- Initialize module.
__init()


-- Return module table.
return M