-- -*- coding: utf-8 -*-
--[[

   Copyright 2014, 2020 Stephan Hennig

   This file is part of Padrinoma.

   Padrinoma is free software: you can redistribute it and/or modify it
   under the terms of the GNU Affero General Public License as published
   by the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.

   Padrinoma is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Affero General Public License for more details.

   You should have received a copy of the GNU Affero General Public
   License along with Padrinoma.  If not, see
   <https://www.gnu.org/licenses/>.

--]]
--
-- luadoc -d API *.lua
--

-- Table for module identification.
local module = {
    name = 'pdnm_nl_manipulation',
    date = '2014/01/22',
    version = '0.1',
    description = 'pattern driven node list manipulation',
    author = 'Stephan Hennig',
    licence = 'GNU AGPL ver. 3',
}

-- Write module identification to the log file.
luatexbase.provides_module(module)

-- Load third-party modules.
local unicode = require('unicode')
local cls_spot = require('autotype-cls_pdnm_spot')

--
-- @trick Prevent LuaDoc from looking past here for module description.
--[[ Trick LuaDoc into entering 'module' mode without using that command.
module(...)
--]]

-- Local module table.
local M = {}

-- Short-cuts.
local Ntraverse = node.traverse
local Tconcat = table.concat
local Tinsert = table.insert
local Tremove = table.remove
local Tsort = table.sort
local TEXgetlccode = tex.getlccode
local Uchar = unicode.utf8.char
local Ugsub = unicode.utf8.gsub

-- Short-cuts for constants.
local DISC = node.id('disc')
local GLYPH = node.id('glyph')
local WHATSIT = node.id('whatsit')
local USER_DEFINED = node.subtype('user_defined')

-- Local references to terminal output functions.
local info = luatexbase.module_info
local warn = luatexbase.module_warning

--- Word property table.
-- @name word property table
-- @class table
-- @field nodes
-- A sequence of glyph nodes corresponding to the letters of the word.
-- Note, there may be additional nodes between two glyph nodes in the
-- original node list.  That is, the assumption
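-- `nodes[i].next == nodes[i+1]` doesn't hold generally.
-- @field exhyphenchars
-- A sequence of indices into table `nodes` of the glyph nodes whose
-- character is `\exhyphenchar`.  TeX normally doesn't insert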
-- discretionaries in words containing explicit hyphens. Looking at
-- this field, one can imitate that behaviour. If a word contains no
-- explicit hyphens, this field is set to the value `nil` instead of an
-- empty table.
-- @field parents
-- A table; the element at index i refers to the node at index i in
-- table `nodes` and is either `nil` or a stack (a table) containing
-- references to parent nodes. Parent nodes are either discretionary
-- nodes or glyph nodes, the latter corresponding to an automatic
-- ligature (the glyph node is part of a node list from a `components`
-- field). An application of this table is refraining from applying
-- manipulations to nodes that are not top-level glyph nodes, i.e., when
-- the value in this table is non-`nil`. If a word contains only
-- top-level glyph nodes, this field is set to the value `nil` instead
-- of an empty table.
-- @field levels
-- A sequence of levels resulting from matching the given spot object
-- against the letters of the word. The level at index i in this table
-- refers to the position between nodes at indices i-1 and i in the
-- other tables (at word boundaries, only one of those two nodes
-- actually exists). Since a word with n letters has n+1 legal level
-- positions, this table is one item longer than the other two tables.
-- Odd values refer to valid spots found in the word.
-- Module-wide variables.
--
-- A sequence of closures.  At the end of the LuaTeX run these closures
-- are called and write debugging information to a file.  The table
-- itself is created in `__init`; `create_node_list_scanner` appends
-- one closure per scanner it creates.
local debug_information
--- Create a custom function that processes a node list.
-- Argument of the returned function is a node list.  The node list is
-- scanned for words in the given language.  All words found are matched
-- against the given patterns.  Return value is a sequence of word
-- property tables.
--
-- @param language  A language identifier (string) or language number
-- the patterns are associated with.
-- @param pattern_name  Name of a pure text UTF-8 pattern file.
-- @param spot_leading  Leading spot min when applying patterns to words.
-- If the argument is negative (or `nil`), leading spot min is
-- determined automatically as the left hyphen min value of the last
-- glyph node of a word.
-- @param spot_trailing  Trailing spot min when applying patterns to
-- words.  See parameter `spot_leading` for more details.
-- @param is_debug  If this flag is `true`, at the end of the TeX run, a
-- list of words with spots according to the given patterns is written
-- to a file for debugging purposes.  By default, debugging is inactive.
-- @return Custom node list scanner function.
-- @see match_patterns
local function create_node_list_scanner(language, pattern_name, spot_leading, spot_trailing, is_debug)

    -- Upvalues used while matching patterns against the words in a node
    -- list.
    --
    -- Spot object used for pattern matching.
    local spot
    -- Pre-process spot minima.  If arguments are negative (automatic
    -- mode), make variables nil so that later flag evaluation can be
    -- simplified.  A nil argument is treated as automatic mode as well
    -- instead of failing on the comparison.
    if spot_leading and spot_leading < 0 then spot_leading = nil end
    if spot_trailing and spot_trailing < 0 then spot_trailing = nil end
    -- Language number associated with spot object.
    local language_num
    -- Table of words with spots as strings (for debugging).  Words are
    -- stored as keys so that every word is recorded only once.
    local words_with_spots = {}
    -- A sequence of word property tables of the words found in the node
    -- list.
    local words
    -- This table corresponds to field `nodes` in a word property table.
    local word_nodes
    -- This table corresponds to field `exhyphenchars` in a word property
    -- table.
    local word_exhyphenchars
    -- This table corresponds to field `parents` in a word property
    -- table.
    local word_parents
    -- Current stack of parent nodes.  Table `word_parents` contains
    -- copies of this table.
    local parentstack
    -- Flag: are we currently collecting the nodes of a word?
    local is_within_word


    --- Initialize a new current word.
    -- Prepare a new word decomposition.
    local function new_current_word()
        -- Flag current word mode.
        is_within_word = true
        -- Initialize some upvalues.
        word_nodes = {}
        word_exhyphenchars = nil
        word_parents = {}
        parentstack = {}
        -- NOTE(review): the spot object calls `decomposition_start`,
        -- `decomposition_advance`, `decomposition_finish` and the field
        -- `boundary_letter` used in this function and below are restored
        -- from the step comments; confirm names and argument types
        -- against module autotype-cls_pdnm_spot.
        --
        -- Prepare new word decomposition.
        spot:decomposition_start()
        -- Process leading boundary letter.
        spot:decomposition_advance(spot.boundary_letter)
    end


    --- Finish the current word.
    -- Finish word decomposition.  Calculate spot positions for the word
    -- and store word properties in a property table.
    local function finish_current_word()
        -- Reset current word flag.
        is_within_word = false
        -- Ignore empty words, because we cannot access their nodes.
        if #word_nodes == 0 then
            return
        end
        -- Last node of word.
        local last_glyph = word_nodes[#word_nodes]
        -- Is this a word of the current pattern language?
        if last_glyph.lang ~= language_num then
            return
        end
        -- Adjust spot mins.  Must be done before finishing decomposition.
        -- Variables spot_leading and spot_trailing contain a non-negative
        -- number or nil.
        local leading = spot_leading or last_glyph.left
        local trailing = spot_trailing or last_glyph.right
        spot:set_spot_mins(leading, trailing)
        -- Process trailing boundary letter.
        spot:decomposition_advance(spot.boundary_letter)
        -- Finish decomposition.
        spot:decomposition_finish()
        -- Insert processed word into word table.
        -- NOTE(review): `parents` is always a table here, although the
        -- word property table documentation says it is nil for words
        -- made of top-level glyphs only.  Left unchanged for callers.
        Tinsert(words, {
            nodes = word_nodes,
            exhyphenchars = word_exhyphenchars,
            parents = word_parents,
            levels = spot.word_levels,
        })
        -- Debug spots?
        if is_debug then
            local chars = {}
            for _, n in ipairs(word_nodes) do
                Tinsert(chars, Uchar(n.char))
            end
            words_with_spots[Tconcat(spot:to_word_with_spots(chars, spot.word_levels))] = true
        end
    end


    --- Scan a node list for words.
    -- Collects all words subject to pattern matching in the node list.
    -- Match a spot object against the letters (glyph nodes) of the words
    -- and store results in property tables.
    --
    -- Only the `components` list of ligature glyphs and the `replace`
    -- list of discretionaries are descended into.  Any other node
    -- (except user-defined whatsits, which are skipped) ends the
    -- current word.  Words within an `\hbox` aren't hyphenated by TeX
    -- and changes in the language imply word boundaries.
    --
    -- @param head  Node list.
    -- @return Nothing.  Results are accumulated in upvalues.
    local function do_scan_node_list(head)
        for n in Ntraverse(head) do
            local nid = n.id
            if nid == GLYPH then
                -- Automatic ligature or fundamental glyph?
                local components = n.components
                if components then
                    -- Automatic ligature.
                    --
                    -- New word?
                    if not is_within_word then
                        new_current_word()
                    end
                    -- Update parent node stack and recurse into component
                    -- node list.
                    Tinsert(parentstack, n)
                    do_scan_node_list(components)
                    Tremove(parentstack)
                else
                    -- Fundamental glyph.
                    local lc = TEXgetlccode(n.char)
                    if lc > 0 then
                        -- Letter glyph.
                        --
                        -- New word?
                        if not is_within_word then
                            new_current_word()
                        end
                        -- Add node to table.
                        Tinsert(word_nodes, n)
                        if n.char == tex.exhyphenchar then
                            word_exhyphenchars = word_exhyphenchars or {}
                            Tinsert(word_exhyphenchars, #word_nodes)
                        end
                        -- Advance decomposition.
                        -- NOTE(review): assumes letters are passed as
                        -- UTF-8 strings of the lower-case code; confirm.
                        spot:decomposition_advance(Uchar(lc))
                        -- Add copy of current parent node stack to table.
                        -- Top-level glyphs get nil instead of an empty
                        -- stack.
                        local stack_copy
                        if #parentstack > 0 then
                            stack_copy = {}
                            for i, parent in ipairs(parentstack) do
                                stack_copy[i] = parent
                            end
                        end
                        word_parents[#word_nodes] = stack_copy
                    else
                        -- Non-letter glyph.
                        if is_within_word then
                            finish_current_word()
                        end
                    end
                end
            elseif nid == DISC then
                if not is_within_word then
                    new_current_word()
                end
                -- Does the discretionary contain components belonging to a
                -- non-hyphenated word?
                local replace = n.replace
                if replace then
                    -- Update parent node stack and recurse into replacement
                    -- node list.
                    Tinsert(parentstack, n)
                    do_scan_node_list(replace)
                    Tremove(parentstack)
                end
            elseif nid == WHATSIT and n.subtype == USER_DEFINED then
                -- Ignore node.  Don't change state.
            else
                -- Non-word node.
                if is_within_word then
                    finish_current_word()
                end
            end
        end
    end


    --- Set-up a new node list scan.
    -- @param head  A node list.
    -- @return A sequence of word property tables.
    -- @see do_scan_node_list
    local function scan_node_list(head)
        -- Initialize upvalues.
        words = {}
        is_within_word = false
        -- Determine language number associated with spot object.
        --
        -- The language number cannot be determined when the Lua module is
        -- loaded, because languages may be loaded thereafter with polyglossia's
        -- language loading commands \setmainlanguage and \setotherlanguage.  So
        -- we determine the language number within this function, which is not
        -- executed before \begin{document}.
        if not language_num then
            if type(language) == 'string' then
                language_num = luatexbase.registernumber('l@'..language)
            else
                language_num = language
            end
            if type(language_num) ~= 'number' then
                warn(module.name, 'Cannot determine language number for argument \'' .. tostring(language) .. '\'')
            end
        end
        -- Process list.
        do_scan_node_list(head)
        -- Post-process last word.
        if is_within_word then
            finish_current_word()
        end
        -- Remove unneeded references in upvalues, so that nodes aren't
        -- kept alive between scans.
        spot.word_levels = nil
        word_nodes = nil
        word_exhyphenchars = nil
        word_parents = nil
        parentstack = nil
        local twords = words
        words = nil
        return twords
    end


    --- Write debugging information associated to the node list scanner.
    -- If the debug flag is `true`, write debugging information at the
    -- end of the LuaTeX run to a file.  Debugging information consists
    -- of a list of all words handled by the node list scanner with
    -- hyphens inserted at spot positions.  File name is the pattern
    -- file name augmented by the extension `.spots`.
    local function write_debug_information()
        if not is_debug then
            return
        end
        -- Sort words.
        local a = {}
        for k,_ in pairs(words_with_spots) do
            Tinsert(a, k)
        end
        Tsort(a)
        -- Remove all path information from pattern file name.
        local plain_pattern_name = Ugsub(pattern_name, '^.*/', '')
        -- Write words to file.
        local fout = assert(io.open(plain_pattern_name .. '.spots', 'w'))
        for _,v in ipairs(a) do
            fout:write(v, '\n')
        end
        fout:close()
    end


    -- Initialize custom spot object.
    spot = cls_spot:new()
    -- Use tostring() so that a nil pattern name triggers the assertion
    -- message instead of a concatenation error.
    assert(pattern_name and pattern_name ~= '', 'Bad pattern file name: ' .. tostring(pattern_name))
    local fin = assert(kpse.find_file(pattern_name), 'Could not find pattern file ' .. pattern_name)
    fin = assert(io.open(fin, 'r'), 'Could not open pattern file ' .. pattern_name)
    local count = spot:read_file(fin)
    -- Release the file handle, unless the reader already closed it.
    if io.type(fin) == 'file' then
        fin:close()
    end
    info(module.name, count .. ' patterns read from file ' .. pattern_name)
    -- Store custom function writing debug information to a file.
    Tinsert(debug_information, write_debug_information)
    -- Return custom pattern matching function.
    return scan_node_list
end
M.create_node_list_scanner = create_node_list_scanner
--- (internal) Write debugging information.
-- Call-back run at the end of the LuaTeX run.  Calls every closure
-- stored in `debug_information`; each closure writes the words handled
-- by one node list scanner to a file.  File name is the pattern file
-- name augmented by the extension `.spots`.
local function __cb_write_debug_information()
    for _, write_debug_information in ipairs(debug_information) do
        write_debug_information()
    end
end
--- Module initialization.
-- Creates the table collecting debugging closures and registers the
-- call-back that runs them when the LuaTeX run stops.
local function __init()
    -- Initialize debugging information table.
    debug_information = {}
    -- Register stop run call-back for spot debugging output.
    luatexbase.add_to_callback('stop_run', __cb_write_debug_information, 'pdnm_write_debug_information')
end


-- Initialize module.  Must happen before any scanner is created,
-- because scanner creation appends to `debug_information`.
__init()


-- Export module table.
return M