import sys
from pathlib import Path
import re

# Key names whose surrounding bytes are dumped for inspection.
KEYS = (
    'special_eos_id',
    'special_eog_ids',
    'special_tokens',
    'tokenizer.ggml.tokens',
)

# Size of the window printed around each hit, in bytes, measured from the
# first byte of the key.
CONTEXT_BEFORE = 200
CONTEXT_AFTER = 600

# Bytes patterns so the file never has to be decoded as a whole.
# Matches text of the form ``special_eog_ids arr[...] = [ ... ]``.
_EOG_IDS_RE = re.compile(
    rb'special_eog_ids\s*arr\[.*?\]\s*=\s*\[(.*?)\]', re.S
)
# Matches text of the form ``special_eos_id u32 = <digits>``.
_EOS_ID_RE = re.compile(rb'special_eos_id\s*u32\s*=\s*(\d+)')


def _as_text(raw: bytes) -> str:
    """Decode *raw* for display, substituting U+FFFD for invalid UTF-8."""
    return raw.decode('utf-8', errors='replace')


def _dump_key_context(data: bytes, key: str) -> None:
    """Print the bytes surrounding the first occurrence of *key* in *data*.

    Prints ``<key>: NOT FOUND`` when the key does not occur.
    """
    # Search the raw bytes, not decoded text: the index reported below is
    # labelled "byte", and an index into a decoded string drifts away from
    # the real file offset as soon as any non-ASCII byte precedes the key.
    offset = data.find(key.encode('utf-8'))
    if offset == -1:
        print(f"{key}: NOT FOUND")
        return
    window = data[max(0, offset - CONTEXT_BEFORE):offset + CONTEXT_AFTER]
    print('=' * 40)
    print(f'Key: {key} at byte {offset}')
    # Only this small window is decoded, keeping memory use proportional
    # to the file size rather than a multiple of it.
    print(_as_text(window))
    print()


def _dump_parsed_values(data: bytes) -> None:
    """Print values captured by the two textual patterns, if they match."""
    eog = _EOG_IDS_RE.search(data)
    if eog:
        print('parsed special_eog_ids:', _as_text(eog.group(1)).strip())
    else:
        print('special_eog_ids pattern not parsed')

    eos = _EOS_ID_RE.search(data)
    if eos:
        print('parsed special_eos_id:', _as_text(eos.group(1)))
    else:
        print('special_eos_id pattern not parsed')


def main(argv=None) -> int:
    """Dump the context of known tokenizer keys found in a file.

    Args:
        argv: Command-line arguments without the program name. Defaults
            to ``sys.argv[1:]``. Only the first argument (the file path)
            is used.

    Returns:
        Process exit status: 0 on success, 2 when no path was given,
        1 when the path is not a readable regular file.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if not args:
        print('Usage: dump_gguf_specials.py <path-to-gguf>')
        return 2

    path = Path(args[0])
    # is_file() also rejects directories, which exists() would let through
    # only for read_bytes() to fail with a traceback.
    if not path.is_file():
        print('File not found:', path)
        return 1
    try:
        data = path.read_bytes()
    except OSError as exc:
        print(f'Cannot read {path}: {exc}')
        return 1

    for key in KEYS:
        _dump_key_context(data, key)
    _dump_parsed_values(data)
    return 0


if __name__ == '__main__':
    sys.exit(main())