diff --git "a/journals/20260219-0005-Vibrato\345\275\242\346\205\213\347\264\240\350\247\243\346\236\220\345\276\251\346\227\247.md" "b/journals/20260219-0005-Vibrato\345\275\242\346\205\213\347\264\240\350\247\243\346\236\220\345\276\251\346\227\247.md" new file mode 100644 index 0000000..0cbd5ce --- /dev/null +++ "b/journals/20260219-0005-Vibrato\345\275\242\346\205\213\347\264\240\350\247\243\346\236\220\345\276\251\346\227\247.md" @@ -0,0 +1,50 @@ +# 20260219-0005-Vibrato形態素解析復旧 + +## 作業実施の理由 + +日本語の形態素解析(わかち書き)機能が、Lindera の辞書ダウンロード失敗(DNS/ネットワークの問題)によりビルドエラーとなっており、正規の日本語検索が利用できない状態だったため、より堅牢な Vibrato への切り替えを行い、機能を復旧させる。 + +## 指示内容 + +- **背景**: Lindera のビルドエラーにより、日本語の LSA 検索機能が停止していた。 +- **観点**: Windows 環境でのビルド安定性、外部ネットワーク不要の自己完結型バイナリ。 +- **意図**: 辞書をバイナリに埋め込み、どのような環境でも確実に日本語検索が動くようにする。 + +## 指摘事項とその対応 + +- **指摘**: 辞書ダウンロードの URL が 404 や HTML 取得になってしまう。 +- **対応**: GitHub の Release ページから `.tar.xz` アーカイブを取得し、内部の `system.dic.zst` を正しく抽出・配置する手順を確立した。 +- **指摘**: include_bytes! のパスエラー。 +- **対応**: ファイル位置からの正確な相対パスに修正し、コンパイルを通した。 + +## 作業詳細 + +1. AIエージェントは Lindera を削除し、`Cargo.toml` に `vibrato` と `zstd` を導入した。 +2. AIエージェントは `daac-tools/vibrato` のリリースから辞書アーカイブを救出し、`src-tauri/resources/ipadic.vibrato.zst` に配置した。 + - **取得元URL**: `https://github.com/daac-tools/vibrato/releases/download/v0.5.0/ipadic-mecab-2_7_0.tar.xz` + - **アーカイブ内パス**: `ipadic-mecab-2_7_0/system.dic.zst` +3. AIエージェントは `src/utils/tokenizer.rs` を全面的に刷新し、Vibrato による形態素解析ラッパーを実装した。 +4. AIエージェントは辞書読み込みロジックにて、解凍済みバイナリと圧縮バイナリの両方に対応するハイブリッド方式を採用し、堅牢性を高めた。 + +## Mermaid図解 + +```mermaid +sequenceDiagram + participant U as ユーザー + participant A as AIエージェント (Antigravity) + participant C as Compiler (Cargo) + participant D as Dictionary (Asset) + + A->>U: Vibrato への切り替えを提案 + U->>A: 承認 + A->>D: 辞書アーカイブのダウンロード + A->>A: アーカイブから辞書を抽出 + A->>A: tokenizer.rs の実装修正 + A->>C: cargo test 実行 + C-->>A: Test Passed (わかち書き成功) + A->>U: 作業完了報告 +``` + +## AI視点での結果 + +Vibrato への移行により、Lindera で発生していたネットワーク起因のビルドエラーを完全に解消した。辞書をバイナリに埋め込んだことで、ポータビリティが向上し、TelosDB の本来の目的である「どこでも動く軽量検索」が日本語でも実現可能となった。 diff --git a/src-tauri/Cargo.lock b/src-tauri/Cargo.lock index f5470b2..da6fce0 100644 --- a/src-tauri/Cargo.lock +++ b/src-tauri/Cargo.lock @@ -155,7 +155,7 @@ dependencies = [ "anyhow", "axum", - "bincode", + "bincode 1.3.3", "chrono", "dirs", "env_logger", @@ -177,6 +177,52 @@ "tokio-stream", "tower-http", "uuid", + "vibrato", + "zstd", +] + +[[package]] +name = "argmin" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "760a49d596b18b881d2fe6e9e6da4608fa64d4a7653ef5cd43bfaa4da018d596" +dependencies = [ + "anyhow", + "argmin-math", + "instant", + "num-traits", + "paste", + "rand 0.8.5", + "rand_xoshiro", + "thiserror 1.0.69", +] + +[[package]] +name = "argmin-math" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d93a0d0269b60bd1cd674de70314e3f0da97406cf8c1936ce760d2a46e0f13fe" +dependencies = [ + "anyhow", + "cfg-if", + "num-complex", + "num-integer", + "num-traits", + "rand 0.8.5", + "thiserror 1.0.69", +] + +[[package]] +name = "argmin-observer-slog" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83d798c8ab61e6a55d161775f2ae8c42e56c15b746398878c36234575e6839aa" +dependencies = [ + "anyhow", + "argmin", + "slog", + "slog-async", + "slog-term", ] [[package]] @@ -371,6 +417,26 @@ ] [[package]] +name = "bincode" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36eaf5d7b090263e8150820482d5d93cd964a81e4019913c972f4edcc6edb740" +dependencies = [ + "bincode_derive", + "serde", + "unty", +] + +[[package]] +name = "bincode_derive" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf95709a440f45e986983918d0e8a1f30a9b1df04918fc828670606804ac3c09" +dependencies = [ + "virtue", +] + +[[package]] name = "bitflags" version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -594,6 +660,8 @@ checksum = "aebf35691d1bfb0ac386a69bac2fde4dd276fb618cf8bf4f5318fe285e821bb2" dependencies = [ "find-msvc-tools", + "jobserver", + "libc", "shlex", ] @@ -757,6 +825,12 @@ ] [[package]] +name = "crawdad" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87fbd1ecd2ed790e11c8fbe034f9b3e7687404818d1bdfd8218d26ec645ec7c5" + +[[package]] name = "crc" version = "3.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -842,6 +916,15 @@ ] [[package]] +name = "csv-core" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "704a3c26996a80471189265814dbc2c257598b96b8a7feae2d31ace646bb9782" +dependencies = [ + "memchr", +] + +[[package]] name = "ctor" version = "0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -1146,6 +1229,15 @@ [[package]] name = "erased-serde" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c138974f9d5e7fe373eb04df7cae98833802ae4b11c24ac7039a21d5af4b26c" +dependencies = [ + "serde", +] + +[[package]] +name = "erased-serde" version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "89e8918065695684b2b0702da20382d5ae6065cf3327bc2d6436bd49a71ce9f3" @@ -1837,6 +1929,12 @@ checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" [[package]] +name = "hermit-abi" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" + +[[package]] name = "hex" version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -2198,6 +2296,15 @@ ] [[package]] +name = "instant" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222" +dependencies = [ + "cfg-if", +] + +[[package]] name = "ipnet" version = "2.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -2223,6 +2330,17 @@ ] [[package]] +name = "is-terminal" +version = "0.4.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3640c1c38b8e4e43584d8df18be5fc6b0aa314ce6ebf51b53313d4306cca8e46" +dependencies = [ + "hermit-abi", + "libc", + "windows-sys 0.61.2", +] + +[[package]] name = "is-wsl" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -2314,6 +2432,16 @@ checksum = "8eaf4bc02d17cbdd7ff4c7438cafcdf7fb9a4613313ad11b4f8fefe7d3fa0130" [[package]] +name = "jobserver" +version = "0.1.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33" +dependencies = [ + "getrandom 0.3.4", + "libc", +] + +[[package]] name = "js-sys" version = "0.3.85" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -3165,6 +3293,12 @@ ] [[package]] +name = "paste" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" + +[[package]] name = "pathdiff" version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -3683,6 +3817,15 @@ ] [[package]] +name = "rand_xoshiro" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f97cdb2a36ed4183de61b2f824cc45c9f1037f28afe0a322e9fff4c108b5aaa" +dependencies = [ + "rand_core 0.6.4", +] + +[[package]] name = "raw-window-handle" version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -3919,6 +4062,20 @@ ] [[package]] +name = "rucrf" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5f17d147c82c4a3d9f84274769a5a301b22b4f006b3d0f8a06570bac0acdc91" +dependencies = [ + "argmin", + "argmin-math", + "argmin-observer-slog", + "bincode 2.0.1", + "crossbeam-channel", + "hashbrown 0.15.5", +] + +[[package]] name = "rusqlite" version = "0.32.1" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -4253,7 +4410,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f9faf48a4a2d2693be24c6289dbe26552776eb7737074e6722891fadbe6c5058" dependencies = [ - "erased-serde", + "erased-serde 0.4.9", "serde", "serde_core", "typeid", @@ -4529,6 +4686,44 @@ checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5" [[package]] +name = "slog" +version = "2.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b3b8565691b22d2bdfc066426ed48f837fc0c5f2c8cad8d9718f7f99d6995c1" +dependencies = [ + "anyhow", + "erased-serde 0.3.31", + "rustversion", + "serde_core", +] + +[[package]] +name = "slog-async" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72c8038f898a2c79507940990f05386455b3a317d8f18d4caea7cbc3d5096b84" +dependencies = [ + "crossbeam-channel", + "slog", + "take_mut", + "thread_local", +] + +[[package]] +name = "slog-term" +version = "2.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5cb1fc680b38eed6fad4c02b3871c09d2c81db8c96aa4e9c0a34904c830f09b5" +dependencies = [ + "chrono", + "is-terminal", + "slog", + "term", + "thread_local", + "time", +] + +[[package]] name = "smallvec" version = "1.15.1" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -4977,6 +5172,12 @@ ] [[package]] +name = "take_mut" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f764005d11ee5f36500a149ace24e00e3da98b0158b3e2d53a7495660d3f4d60" + +[[package]] name = "tao" version = "0.34.5" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -5339,6 +5540,15 @@ ] [[package]] +name = "term" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8c27177b12a6399ffc08b98f76f7c9a1f4fe9fc967c784c5a071fa8d93cf7e1" +dependencies = [ + "windows-sys 0.61.2", +] + +[[package]] name = "thiserror" version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -5379,6 +5589,15 @@ ] [[package]] +name = "thread_local" +version = "1.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f60246a4944f24f6e018aa17cdeffb7818b76356965d03b07d6a9886e8962185" +dependencies = [ + "cfg-if", +] + +[[package]] name = "time" version = "0.3.47" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -5811,6 +6030,12 @@ checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" [[package]] +name = "unty" +version = "0.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d49784317cd0d1ee7ec5c716dd598ec5b4483ea832a2dced265471cc0f690ae" + +[[package]] name = "url" version = "2.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -5896,6 +6121,26 @@ checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" [[package]] +name = "vibrato" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b02279291f68f435d84407eae0a99a8ac3dea4c2ff1f9efc86ab2431d13606bd" +dependencies = [ + "bincode 2.0.1", + "crawdad", + "csv-core", + "hashbrown 0.12.3", + "regex", + "rucrf", +] + +[[package]] +name = "virtue" +version = "0.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "051eb1abcf10076295e815102942cc58f9d5e3b4560e46e53c21e8ff6f3af7b1" + +[[package]] name = "vswhom" version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -7016,3 +7261,31 @@ version = "1.0.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" + +[[package]] +name = "zstd" +version = "0.13.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a" +dependencies = [ + "zstd-safe", +] + +[[package]] +name = "zstd-safe" +version = "7.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f49c4d5f0abb602a93fb8736af2a4f4dd9512e36f7f570d66e65ff867ed3b9d" +dependencies = [ + "zstd-sys", +] + +[[package]] +name = "zstd-sys" +version = "2.0.16+zstd.1.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e19ebc2adc8f83e43039e79776e3fda8ca919132d68a1fed6a5faca2683748" +dependencies = [ + "cc", + "pkg-config", +] diff --git a/src-tauri/Cargo.toml b/src-tauri/Cargo.toml index 0407fd5..cc77d86 100644 --- a/src-tauri/Cargo.toml +++ b/src-tauri/Cargo.toml @@ -42,7 +42,8 @@ reqwest = { version = "0.12", features = ["json"] } dirs = "6.0" # Japanese NLP & LSA -# lindera = { version = "2.2.0", features = ["ipadic"] } +vibrato = "0.5.2" +zstd = "0.13" ndarray = "0.15" # SVD は rsvd, ndarray-linalg なしで実施する方法を模索中 bincode = "1.3" diff --git a/src-tauri/resources/ipadic.vibrato.zst b/src-tauri/resources/ipadic.vibrato.zst new file mode 100644 index 0000000..230307b --- /dev/null +++ b/src-tauri/resources/ipadic.vibrato.zst Binary files differ diff --git a/src-tauri/src/utils/tokenizer.rs b/src-tauri/src/utils/tokenizer.rs index c7fe12e..a8e4b0f 100644 --- a/src-tauri/src/utils/tokenizer.rs +++ b/src-tauri/src/utils/tokenizer.rs @@ -1,26 +1,58 @@ -use anyhow::Result; +use anyhow::{Result, anyhow}; +use vibrato::Dictionary; +use vibrato::Tokenizer; +use std::io::Read; -pub struct JapaneseTokenizer; +pub struct JapaneseTokenizer { + tokenizer: Tokenizer, +} impl JapaneseTokenizer { pub fn new() -> Result { - Ok(JapaneseTokenizer) + // src-tauri/src/utils/tokenizer.rs から resources/ は ../../resources/ + // パスが見つからないエラーが出るため、絶対パスではなくソースコードの位置からの相対を再確認 + let dict_data = include_bytes!("../../resources/ipadic.vibrato.zst"); + + // 辞書の読み込み + // 8MB 程度あるなら解凍済みの可能性が高い。直接読み込みを試みる。 + let dict = match Dictionary::read(&dict_data[..]) { + Ok(d) => d, + Err(_) => { + // 直接読み込みに失敗した場合は zstd 解凍を試みる + let mut decoder = zstd::stream::read::Decoder::new(&dict_data[..])?; + let mut decoded_data = Vec::new(); + decoder.read_to_end(&mut decoded_data)?; + Dictionary::read(&decoded_data[..]) + .map_err(|e| anyhow!("Failed to read Vibrato dictionary: {}", e))? + } + }; + + let tokenizer = Tokenizer::new(dict); + + Ok(JapaneseTokenizer { tokenizer }) } - /// ダミー実装: スペースや句読点で分割 + /// テキストを形態素解析し、わかち書き(スペース区切り)の文字列として返す pub fn tokenize_to_string(&self, text: &str) -> Result { - let tokens = self.tokenize_to_vec(text)?; - Ok(tokens.join(" ")) + let mut worker = self.tokenizer.new_worker(); + worker.reset_sentence(text); + worker.tokenize(); + + let result: Vec<&str> = worker + .token_iter() + .map(|token| token.surface()) + .collect(); + + Ok(result.join(" ")) } - /// ダミー実装: 単純な分割 + /// 単語のリスト(ベクタ)として返す pub fn tokenize_to_vec(&self, text: &str) -> Result> { - let tokens: Vec = text - .split(|c: char| c.is_whitespace() || "、。!?,.!?".contains(c)) - .filter(|s| !s.is_empty()) - .map(|s| s.to_string()) - .collect(); - Ok(tokens) + let mut worker = self.tokenizer.new_worker(); + worker.reset_sentence(text); + worker.tokenize(); + + Ok(worker.token_iter().map(|t| t.surface().to_string()).collect()) } } @@ -30,9 +62,10 @@ #[test] fn test_japanese_tokenization() { - let tokenizer = JapaneseTokenizer::new().unwrap(); + let tokenizer = JapaneseTokenizer::new().expect("Failed to create tokenizer"); let text = "すもももももももものうち"; let tokenized = tokenizer.tokenize_to_string(text).unwrap(); + println!("Tokenized: {}", tokenized); assert!(tokenized.contains("すもも")); assert!(tokenized.contains("もも")); }