Files
ObsidianDragon/scripts/build_cjk_subset.py
DanS fbdba1a001 feat: CJK font rendering, force quit confirmation, settings i18n
- Rebuild CJK font subset (1421 glyphs) and convert CFF→TTF for
  stb_truetype compatibility, fixing Chinese/Japanese/Korean rendering
- Add force quit confirmation dialog with cancel/confirm actions
- Show force quit tooltip immediately on hover (no delay)
- Translate hardcoded English strings in settings dropdowns
  (auto-lock timeouts, slider "Off" labels)
- Fix mojibake en-dashes in 7 translation JSON files
- Add helper scripts: build_cjk_subset, convert_cjk_to_ttf,
  check_font_coverage, fix_mojibake
2026-04-12 10:32:58 -05:00

132 lines
4.1 KiB
Python

#!/usr/bin/env python3
"""
Build a NotoSansCJK subset font containing all characters used by
the zh, ja, and ko translation files, plus common CJK punctuation
and symbols.
Usage:
python3 scripts/build_cjk_subset.py
Requires: pip install fonttools brotli
"""
import json
import os
from fontTools.ttLib import TTFont
from fontTools import subset as ftsubset
# Repository root: this script lives in <root>/scripts/, so go one level up.
ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))
# Translation JSON files live under res/lang/ (zh.json, ja.json, ko.json).
LANG_DIR = os.path.join(ROOT, 'res', 'lang')
# Full NotoSansCJK source font (downloaded separately) and the subset output.
SOURCE_FONT = '/tmp/NotoSansCJKsc-Regular.otf'
OUTPUT_FONT = os.path.join(ROOT, 'res', 'fonts', 'NotoSansCJK-Subset.ttf')
# Gather every non-ASCII codepoint used by the CJK translation files.
# ASCII is rendered by the Ubuntu font, so only codepoints above U+007F
# need to be present in the subset.
needed = set()
for lang_code in ('zh', 'ja', 'ko'):
    lang_path = os.path.join(LANG_DIR, f'{lang_code}.json')
    if not os.path.exists(lang_path):
        continue
    with open(lang_path, 'r', encoding='utf-8') as fh:
        translations = json.load(fh)
    for text in translations.values():
        if not isinstance(text, str):
            continue
        needed.update(cp for cp in map(ord, text) if cp > 0x7F)
# Proactively include ranges that future translations are likely to use:
#   U+3000-30FF  CJK punctuation/symbols, Hiragana, Katakana
#   U+3100-312F  Bopomofo
#   U+FF01-FF5E  fullwidth ASCII variants (commonly mixed into CJK text)
needed.update(range(0x3000, 0x3100))
needed.update(range(0x3100, 0x3130))
needed.update(range(0xFF01, 0xFF5F))
print(f"Total non-ASCII characters to include: {len(needed)}")
# Cross-check the wish list against what the source font actually maps.
source_font = TTFont(SOURCE_FONT)
source_codepoints = set(source_font.getBestCmap())
supportable = needed & source_codepoints
unsupported = needed - source_codepoints
print(f"Supported by source font: {len(supportable)}")
if unsupported:
    # Not fatal: these fall through to the fallback font at runtime.
    print(f"Not in source font (will use fallback): {len(unsupported)}")
    for cp in sorted(unsupported)[:10]:
        print(f" U+{cp:04X} {chr(cp)}")
# Drive pyftsubset through its CLI-style entry point. --no-hinting and
# --desubroutinize shrink the output and simplify the CFF for conversion.
unicode_spec = ",".join(f"U+{cp:04X}" for cp in sorted(supportable))
subset_args = [
    SOURCE_FONT,
    f'--output-file={OUTPUT_FONT}',
    f'--unicodes={unicode_spec}',
    '--no-hinting',
    '--desubroutinize',
]
ftsubset.main(subset_args)
# ---------------------------------------------------------------------------
# Convert CFF (PostScript, cubic) outlines to TrueType (glyf, quadratic)
# outlines. stb_truetype (used by ImGui) doesn't handle CID-keyed CFF fonts
# properly, so the subset is re-saved as a genuine TrueType font.
# ---------------------------------------------------------------------------
from fontTools.pens.cu2quPen import Cu2QuPen
from fontTools.pens.ttGlyphPen import TTGlyphPen
from fontTools.ttLib import newTable

tmp_otf = OUTPUT_FONT + '.tmp.otf'
os.rename(OUTPUT_FONT, tmp_otf)
conv = TTFont(tmp_otf)
if 'CFF ' in conv:
    print("Converting CFF -> TrueType outlines...")
    glyphOrder = conv.getGlyphOrder()
    glyphSet = conv.getGlyphSet()
    glyf_table = newTable("glyf")
    glyf_table.glyphs = {}
    glyf_table.glyphOrder = glyphOrder
    # Empty loca is fine: glyf.compile() fills in glyph offsets on save.
    loca_table = newTable("loca")
    from fontTools.ttLib.tables._g_l_y_f import Glyph as TTGlyph
    failed = 0
    for gname in glyphOrder:
        try:
            # Cubic -> quadratic with max error of 1 font unit.
            # reverse_direction: CFF outer contours are counter-clockwise,
            # TrueType expects clockwise.
            ttPen = TTGlyphPen(glyphSet)
            cu2quPen = Cu2QuPen(ttPen, max_err=1.0, reverse_direction=True)
            glyphSet[gname].draw(cu2quPen)
            glyf_table.glyphs[gname] = ttPen.glyph()
        except Exception:
            # Best-effort: substitute an empty glyph rather than abort the
            # whole build over one bad outline, but report how many failed.
            glyf_table.glyphs[gname] = TTGlyph()
            failed += 1
    if failed:
        print(f"Warning: {failed} glyph(s) failed conversion; replaced with empty glyphs")
    del conv['CFF ']
    if 'VORG' in conv:
        del conv['VORG']  # vertical-origin table is CFF-specific
    conv['glyf'] = glyf_table
    conv['loca'] = loca_table
    conv['head'].indexToLocFormat = 1  # long (32-bit) loca offsets
    if 'maxp' in conv:
        # BUGFIX: the fontTools attribute is `tableVersion`, not `version`.
        # The old `conv['maxp'].version = ...` silently set an unused
        # attribute, leaving maxp at the CFF-style 0.5 version inside a
        # glyf font. Version 1.0 also makes maxp recalc its glyph stats
        # from the glyf table when the font is saved.
        conv['maxp'].tableVersion = 0x00010000
    conv.sfntVersion = "\x00\x01\x00\x00"  # TrueType magic (vs 'OTTO' for CFF)
# Save unconditionally: if the subset was already TrueType we must still
# restore OUTPUT_FONT before deleting the temp copy below.
conv.save(OUTPUT_FONT)
conv.close()
os.remove(tmp_otf)
size = os.path.getsize(OUTPUT_FONT)
print(f"\nOutput: {OUTPUT_FONT}")
print(f"Size: {size / 1024:.0f} KB")
# Sanity check: reopen the saved subset and confirm character coverage.
verify = TTFont(OUTPUT_FONT)
verify_cmap = set(verify.getBestCmap())
still_missing = needed - verify_cmap
print(f"Verified glyphs in subset: {len(verify_cmap)}")
if not still_missing:
    print("All needed characters are covered!")
else:
    # Expected: some Hangul/Hiragana codepoints don't exist in the source font.
    print(f"Not coverable by this font: {len(still_missing)} (need additional font)")
    for cp in sorted(still_missing)[:10]:
        print(f" U+{cp:04X} {chr(cp)}")