#!/usr/bin/python
# coding: utf-8
#---------------
# Convert a TeX document to and from Unicode, translating known control
# sequences to and from Unicode symbols.
#
# The code is written to run unchanged under Python 2.6+ and Python 3.
#---------------

import sys
import re
import codecs
import unicodedata
import getopt
import os.path

# The conversion table {{{1

# The following string contains the translation table.
# The syntax of each line is:
#   <TAB> unicode-character <TAB> TeX-name
# Lines not of this form are ignored.
# The TeX name is the control sequence without its leading backslash.
#
# The tabs are spelled as \t escapes (the string is not raw, so they become
# real tab characters at run time); this way an editor that expands tabs
# cannot silently break the table.

tex_unicode_table = u"""
\t…\tdots

\tα\talpha
\tβ\tbeta
\tγ\tgamma
\tδ\tdelta
\tε\tepsilon
\tζ\tzeta
\tη\teta
\tθ\ttheta
\tι\tiota
\tκ\tkappa
\tλ\tlambda
\tμ\tmu
\tν\tnu
\tξ\txi
\tπ\tpi
\tρ\trho
\tσ\tsigma
\tτ\ttau
\tυ\tupsilon
\tφ\tphi
\tχ\tchi
\tψ\tpsi
\tω\tomega

\tΓ\tGamma
\tΔ\tDelta
\tΘ\tTheta
\tΛ\tLambda
\tΞ\tXi
\tΠ\tPi
\tΣ\tSigma
\tΦ\tPhi
\tΨ\tPsi
\tΩ\tOmega

\tℓ\tell
\t⅋\tparr

\t←\tleftarrow
\t↑\tuparrow
\t→\tto
\t↓\tdownarrow
\t↔\tleftrightarrow
\t↕\tupdownarrow
\t⇐\tLeftarrow
\t⇒\tRightarrow
\t⇔\tLeftrightarrow
\t⇚\tLleftarrow
\t⇛\tRrightarrow

\t¬\tneg
\t×\ttimes
\t∀\tforall
\t∃\texists
\t∅\temptyset
\t∈\tin
\t∉\tnotin
\t∘\tcirc
\t∧\twedge
\t∨\tvee
\t∩\tcap
\t∪\tcup
\t≃\tsimeq
\t≅\tcong
\t≠\tneq
\t≡\tequiv
\t≤\tleq
\t≥\tgeq
\t⊂\tsubset
\t⊃\tsupset
\t⊆\tsubseteq
\t⊇\tsupseteq
\t⊏\tsqsubset
\t⊐\tsqsupset
\t⊑\tsqsubseteq
\t⊒\tsqsupseteq
\t⊕\toplus
\t⊗\totimes
\t⊙\todot
\t⊢\tvdash
\t⊣\tdashv
\t⊤\ttop
\t⊥\tbot
\t⊨\tvDash
\t⊩\tVdash
\t⊸\tmultimap
\t⋅\tcdot

\t♭\tflat
\t♮\tnatural
\t♯\tsharp

\t⟦\tllbracket
\t⟧\trrbracket
\t⟨\tlangle
\t⟩\trangle
"""

# Translating between Unicode code points and TeX names {{{1

class Translator(object):
    """Two-way mapping between Unicode characters and TeX control sequence
    names, initialised from ``tex_unicode_table``.

    Attributes:
        u2t: dict mapping a one-character Unicode string to its TeX name.
        t2u: dict mapping a TeX name to its one-character Unicode string.
        warned: characters for which an "unknown character" warning has
            already been printed, so each one is reported only once.
    """

    def __init__(self):
        self.warned = set()
        self.u2t = {}
        self.t2u = {}
        self.read_string(tex_unicode_table)

    def read_string(self, text):
        """Add the entries described by *text* to both mappings.

        Only lines starting with a tab are considered; they must look like
        ``<TAB>char<TAB>name`` (further tab-separated fields are ignored).
        A tab-initial line that does not fit is reported on stderr with its
        line number and skipped.  Later entries override earlier ones.
        """
        for lnum, line in enumerate(text.split('\n'), 1):
            if not line.startswith(u'\t'):
                continue
            fields = line.split('\t')
            # fields[0] is the empty string before the leading tab.
            if len(fields) < 3 or len(fields[1]) != 1:
                sys.stderr.write("invalid data at line %d\n" % lnum)
                continue
            self.u2t[fields[1]] = fields[2]
            self.t2u[fields[2]] = fields[1]

    def has_unicode(self, c):
        """Return whether character *c* has a TeX name in the table."""
        return c in self.u2t

    def to_tex(self, c):
        """Return the TeX name for character *c*.

        For a character missing from the table, a warning is written to
        stderr the first time it is seen and the empty string is returned.
        """
        if c in self.u2t:
            return self.u2t[c]
        if c not in self.warned:
            self.warned.add(c)
            # Not every code point has a name (private use, surrogates...);
            # the default keeps unicodedata.name() from raising ValueError.
            sys.stderr.write("unknown character: %5d %4x %s\n"
                % (ord(c), ord(c), unicodedata.name(c, "<unnamed>")))
        return ""

    def has_tex(self, s):
        """Return whether TeX name *s* has a Unicode character in the table."""
        return s in self.t2u

    def to_unicode(self, s):
        """Return the Unicode character for TeX name *s*.

        Raises KeyError when *s* is unknown; check with has_tex() first.
        """
        return self.t2u[s]

# Writing TeX code with proper spacing {{{1

class TeXWrite(object):
    """Writer that emits text and control sequences, inserting a space after
    a control sequence only when the following text would otherwise be read
    as part of its name.
    """

    def __init__(self, file, enc="latin-1"):
        """Wrap the byte stream *file*; text is encoded with codec *enc*."""
        self.str = codecs.getwriter(enc)(file)
        # True right after a control sequence has been written.
        self.need_space = 0

    def write(self, text):
        """Write *text*, preceded by a space if it starts with an ASCII
        letter or '@' directly after a control sequence.  Empty text is
        ignored and leaves the pending-space state untouched.
        """
        if not text:
            return
        if self.need_space:
            c = text[0]
            if 'A' <= c <= 'Z' or 'a' <= c <= 'z' or c == '@':
                self.str.write(" ")
            self.need_space = 0
        self.str.write(text)

    def cseq(self, name):
        """Write the control sequence ``\\name``; an empty *name* is a no-op."""
        if name == "":
            return
        self.str.write("\\" + name)
        self.need_space = 1

# The conversions {{{1

# A control word: backslash, letters, and one optional trailing space (the
# space only terminates the name, so it is dropped along with it).
re_cseq = re.compile(r"\\(?P<name>[a-zA-Z]+) ?")

def unicode_to_tex(lines, out, trans):
    """Copy the Unicode strings in *lines* to the TeXWrite object *out*.

    Characters below code point 256 are written as they are; every other
    character is replaced by the control sequence *trans* gives for it
    (nothing at all is written for characters *trans* does not know).
    """
    for line in lines:
        for c in line:
            if ord(c) < 256:
                out.write(c)
            else:
                out.cseq(trans.to_tex(c))

def tex_to_unicode(lines, out, trans):
    """Copy the Unicode strings in *lines* to *out* (anything with a
    ``write`` method), replacing each control word known to *trans* by its
    Unicode character.  Unknown control words are copied unchanged,
    including the space that may follow them.
    """
    for line in lines:
        pos = 0     # start of the text not yet written
        for m in re_cseq.finditer(line):
            name = m.group("name")
            if trans.has_tex(name):
                out.write(line[pos:m.start()])
                out.write(trans.to_unicode(name))
                pos = m.end()
        out.write(line[pos:])

# The command line {{{1

_HELP_TEXT = """\
usage: tex-utf8 [options] [input [output]]
  -e, --encoding=ENCODING  use ENCODING for the non-Unicode side
  -h, --help               display this help and exit
  -t, --tex                translate from Unicode to TeX commands
  -u, --unicode            translate from TeX commands to Unicode
"""

def help():
    """Print the usage message on stdout and exit with status 0."""
    sys.stdout.write(_HELP_TEXT)
    sys.exit(0)

def _binary_stream(stream):
    """Return the byte-level stream behind a standard stream.

    Python 3 text streams expose it as ``.buffer``; Python 2 standard
    streams already carry bytes and are returned unchanged.
    """
    return getattr(stream, "buffer", stream)

# The main program {{{1

def main(argv=None):
    """Run the converter.

    *argv* is the list of command-line arguments without the program name
    (default: ``sys.argv[1:]``).  Usage errors are reported on stderr and
    end the program through ``sys.exit(1)``.
    """
    if argv is None:
        argv = sys.argv[1:]

    tex_enc = "latin-1"
    action = None
    in_fname = None
    out_fname = None

    try:
        opts, args = getopt.getopt(argv, "e:htu",
            ["encoding=", "help", "tex", "unicode"])
    except getopt.GetoptError as err:
        sys.stderr.write("%s\n" % err)
        sys.exit(1)

    for (opt, arg) in opts:
        if opt in ("-e", "--encoding"):
            tex_enc = arg
        elif opt in ("-h", "--help"):
            help()
        elif opt in ("-t", "--tex"):
            action = "tex"
        elif opt in ("-u", "--unicode"):
            action = "unicode"

    if len(args) > 2:
        sys.stderr.write("too many arguments\n")
        sys.exit(1)
    if len(args) >= 1:
        in_fname = args[0]
    if len(args) == 2:
        out_fname = args[1]

    if action is None:
        sys.stderr.write("no action specified\n")
        sys.exit(1)

    # Reject a bad encoding name before any file is opened or truncated.
    try:
        codecs.lookup(tex_enc)
    except LookupError:
        sys.stderr.write("unknown encoding: %s\n" % tex_enc)
        sys.exit(1)

    trans = Translator()

    # The codecs readers and writers work on bytes, hence the binary modes.
    in_file = out_file = None
    try:
        if in_fname is None:
            in_bin = _binary_stream(sys.stdin)
        else:
            in_bin = in_file = open(in_fname, "rb")
        if out_fname is None:
            out_bin = _binary_stream(sys.stdout)
        else:
            out_bin = out_file = open(out_fname, "wb")

        if action == "tex":
            unicode_to_tex(codecs.getreader("UTF-8")(in_bin),
                TeXWrite(out_bin, enc=tex_enc), trans)
        else:
            tex_to_unicode(codecs.getreader(tex_enc)(in_bin),
                codecs.getwriter("UTF-8")(out_bin), trans)
        out_bin.flush()
    finally:
        # Only close what was opened here, never the standard streams.
        for f in (out_file, in_file):
            if f is not None:
                f.close()

if __name__ == "__main__":
    main()