# TelosDB / .scripts / dump_gguf_specials.py
# Last change: "chore: prepare for history rewrite" (楽曲作りまくりおじさん, 1 KB)
"""Dump the regions of a GGUF file that mention special-token metadata.

Usage: dump_gguf_specials.py <path-to-gguf>

The file is scanned as raw bytes for a fixed set of key names. For every key
found, a window of surrounding bytes is printed (decoded leniently as UTF-8),
together with the byte offset of the first occurrence. Two textual patterns
(``special_eog_ids arr[...] = [...]`` and ``special_eos_id u32 = N``) are also
searched for and, if present, their values are printed.
"""
import re
import sys
from pathlib import Path
from typing import Optional, Sequence, Tuple

# Key names searched for in the raw file contents.
KEYS = ('special_eos_id', 'special_eog_ids', 'special_tokens', 'tokenizer.ggml.tokens')

# Number of bytes of context shown before / after the start of a matched key.
CONTEXT_BEFORE = 200
CONTEXT_AFTER = 600

# Byte patterns, so matching never requires decoding the whole file.
_EOG_IDS_RE = re.compile(rb'special_eog_ids\s*arr\[.*?\]\s*=\s*\[(.*?)\]', re.S)
_EOS_ID_RE = re.compile(rb'special_eos_id\s*u32\s*=\s*(\d+)')


def _to_text(raw: bytes) -> str:
    """Decode *raw* as UTF-8, substituting U+FFFD for undecodable bytes."""
    return raw.decode('utf-8', errors='replace')


def find_snippet(data: bytes, key: str) -> Optional[Tuple[int, str]]:
    """Locate the first occurrence of *key* in *data*.

    Returns ``(byte_offset, snippet)`` where *snippet* is the decoded text of
    the bytes from ``CONTEXT_BEFORE`` before the key to ``CONTEXT_AFTER``
    after its start (clamped to the data bounds), or ``None`` if the key does
    not occur.
    """
    # Search the raw bytes: an index into a lossily decoded string would not
    # be a byte offset, because invalid bytes and multi-byte sequences each
    # collapse to a single character.
    offset = data.find(key.encode('utf-8'))
    if offset == -1:
        return None
    start = max(0, offset - CONTEXT_BEFORE)
    end = min(len(data), offset + CONTEXT_AFTER)
    return offset, _to_text(data[start:end])


def parse_eog_ids(data: bytes) -> Optional[str]:
    """Return the text inside ``special_eog_ids arr[...] = [ ... ]``, or None."""
    found = _EOG_IDS_RE.search(data)
    if found is None:
        return None
    return _to_text(found.group(1)).strip()


def parse_eos_id(data: bytes) -> Optional[str]:
    """Return the digits of ``special_eos_id u32 = N`` as a string, or None."""
    found = _EOS_ID_RE.search(data)
    if found is None:
        return None
    return _to_text(found.group(1))


def main(argv: Optional[Sequence[str]] = None) -> int:
    """Run the dump and return the process exit code.

    *argv* defaults to ``sys.argv``; element 1 is the path of the file to
    scan. Returns 2 when the path argument is missing, 1 when the path is not
    an existing file, and 0 otherwise.
    """
    args = sys.argv if argv is None else argv
    if len(args) < 2:
        print('Usage: dump_gguf_specials.py <path-to-gguf>')
        return 2

    path = Path(args[1])
    # is_file() also rejects directories, which would otherwise fail later in
    # read_bytes() with a traceback.
    if not path.is_file():
        print('File not found:', path)
        return 1

    data = path.read_bytes()

    for key in KEYS:
        hit = find_snippet(data, key)
        if hit is None:
            print(f"{key}: NOT FOUND")
            continue
        offset, snippet = hit
        print('=' * 40)
        print(f'Key: {key} at byte {offset}')
        print(snippet)
        print()

    # Also attempt to find patterns like 'special_eog_ids arr' and numeric lists
    eog_ids = parse_eog_ids(data)
    if eog_ids is not None:
        print('parsed special_eog_ids:', eog_ids)
    else:
        print('special_eog_ids pattern not parsed')

    eos_id = parse_eos_id(data)
    if eos_id is not None:
        print('parsed special_eos_id:', eos_id)
    else:
        print('special_eos_id pattern not parsed')

    return 0


if __name__ == "__main__":
    sys.exit(main())