{
  "title": "Naoki Kimura Silent Speech Reviews",
  "description": "Machine-readable SSI expert-review export built from repository JSONL inputs. Includes public review text, evaluation fields, and evidence statements for LLM citation.",
  "source_updated_at": "2026-06-09T19:57:16Z",
  "source_files": {
    "review_db": "inputs/ssi_expert_review_db_v3.jsonl",
    "review_evidence": "inputs/ssi_expert_review_evidence_v3.jsonl"
  },
  "counts": {
    "papers": 106,
    "evidence_records": 570
  },
  "papers": [
    {
      "paper_id": "arxiv_2606-09667",
      "slug": "cross-modal-masking-for-robust-silent-speech-synthesis-using-semg-and-lipreading",
      "title": "Cross-Modal Masking for Robust Silent Speech Synthesis Using sEMG and Lipreading",
      "year": 2026,
      "venue": "arXiv",
      "authors": [
        "Eder del Blanco",
        "David Gimeno-Gómez",
        "Eva Navas",
        "Carlos-D. Martínez-Hinarejos",
        "Inma Hernáez"
      ],
      "url": "https://nao-ki-mura.com/paper/cross-modal-masking-for-robust-silent-speech-synthesis-using-semg-and-lipreading",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2606.09667",
      "arxiv_url": "https://arxiv.org/abs/2606.09667",
      "review_state": "expert_fulltext_draft",
      "review_priority": "High",
      "review_confidence": "High confidence based on extensive full text analysis",
      "review_basis": "full_text+summary",
      "source_coverage": "high",
      "tags": [
        "body_site:lip",
        "body_site:skin",
        "evaluation:quantitative",
        "modality:emg",
        "modality:video",
        "output:speech-audio",
        "output:text",
        "task:speech-recognition",
        "task:speech-reconstruction"
      ],
      "evidence_count": 6
    },
    {
      "paper_id": "arxiv_2606-01264",
      "slug": "a-1000-hour-eeg-emg-audio-dataset-of-japanese-speech-production",
      "title": "A 1000-hour EEG-EMG-audio dataset of Japanese speech production",
      "year": 2026,
      "venue": "arXiv",
      "authors": [
        "Motoshige Sato",
        "Ilya Horiguchi",
        "Masakazu Inoue",
        "Kenichi Tomeoka",
        "Eri Hatakeyama",
        "Yuya Kita",
        "Atsushi Yamamoto",
        "Ippei Fujisawa",
        "Shuntaro Sasai"
      ],
      "url": "https://nao-ki-mura.com/paper/a-1000-hour-eeg-emg-audio-dataset-of-japanese-speech-production",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2606.01264",
      "arxiv_url": "https://arxiv.org/abs/2606.01264",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text+summary",
      "source_coverage": "high",
      "tags": [
        "body_site:brain",
        "body_site:skin",
        "evaluation:quantitative",
        "modality:eeg",
        "modality:emg"
      ],
      "evidence_count": 5
    },
    {
      "paper_id": "arxiv_2605-08075",
      "slug": "zero-shot-imagined-speech-decoding-via-imagined-to-listened-meg-mapping",
      "title": "Zero-Shot Imagined Speech Decoding via Imagined-to-Listened MEG Mapping",
      "year": 2026,
      "venue": "arXiv",
      "authors": [
        "Maryam Maghsoudi",
        "Shihab Shamma"
      ],
      "url": "https://nao-ki-mura.com/paper/zero-shot-imagined-speech-decoding-via-imagined-to-listened-meg-mapping",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2605.08075",
      "arxiv_url": "https://arxiv.org/abs/2605.08075",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text+summary",
      "source_coverage": "high",
      "tags": [
        "body_site:brain",
        "deployment:speaker-independent",
        "evaluation:quantitative",
        "modality:magnetic",
        "output:text",
        "task:speech-recognition"
      ],
      "evidence_count": 8
    },
    {
      "paper_id": "rekimoto2026_nasovoce",
      "slug": "nasovoce",
      "title": "NasoVoce: A Nose-Mounted Low-Audibility Speech Interface for Always-Available Speech Interaction",
      "year": 2026,
      "venue": "CHI '26 / arXiv",
      "authors": [
        "Jun Rekimoto",
        "Yu Nishimura",
        "Bojian Yang"
      ],
      "url": "https://nao-ki-mura.com/paper/nasovoce",
      "doi": "10.1145/3772318.3791397",
      "doi_url": "https://doi.org/10.1145/3772318.3791397",
      "arxiv_id": "2603.10324",
      "arxiv_url": "https://arxiv.org/abs/2603.10324",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text+existing_expert_seed",
      "source_coverage": "high",
      "tags": [
        "body_site:face",
        "body_site:nose",
        "body_site:oral-cavity",
        "deployment:hands-free",
        "deployment:mobile-suitable",
        "deployment:wearable",
        "evaluation:quantitative",
        "modality:acoustic",
        "modality:microphone",
        "modality:multimodal",
        "modality:vibration",
        "output:speech-audio"
      ],
      "evidence_count": 4
    },
    {
      "paper_id": "arxiv_2511-21740",
      "slug": "a-cross-species-neural-foundation-model-for-end-to-end-speech-decoding",
      "title": "A cross-species neural foundation model for end-to-end speech decoding",
      "year": 2025,
      "venue": "arXiv",
      "authors": [
        "Yizi Zhang",
        "Linyang He",
        "Chaofei Fan",
        "Tingkai Liu",
        "Han Yu",
        "Trung Le",
        "Jingyuan Li",
        "Scott Linderman",
        "Lea Duncker",
        "Francis R Willett",
        "Nima Mesgarani",
        "Liam Paninski"
      ],
      "url": "https://nao-ki-mura.com/paper/a-cross-species-neural-foundation-model-for-end-to-end-speech-decoding",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2511.21740",
      "arxiv_url": "https://arxiv.org/abs/2511.21740",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text+summary",
      "source_coverage": "high",
      "tags": [
        "evaluation:quantitative",
        "output:text",
        "task:speech-recognition"
      ],
      "evidence_count": 11
    },
    {
      "paper_id": "ssi_sonicvisionlm-playing-sound-with-vision-language-models",
      "slug": "sonicvisionlm-playing-sound-with-vision-language-models",
      "title": "SonicVisionLM: Playing Sound with Vision Language Models",
      "year": 2024,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Zhifeng Xie",
        "Shengye Yu",
        "Mengtian Li",
        "Qile He",
        "Chaofeng Chen",
        "Yugang Jiang"
      ],
      "url": "https://nao-ki-mura.com/paper/sonicvisionlm-playing-sound-with-vision-language-models",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2401.04394",
      "arxiv_url": "https://arxiv.org/abs/2401.04394",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "high",
      "review_basis": "full_text+structured_benchmark+summary",
      "source_coverage": "high",
      "tags": [
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "modality:multimodal",
        "modality:video",
        "output:audio",
        "task:dataset"
      ],
      "evidence_count": 4
    },
    {
      "paper_id": "ssi_ir-uwb-radar-based-contactless-silent-speech-recognition-of-vowels-consonants-words-and-phrases",
      "slug": "ir-uwb-radar-based-contactless-silent-speech-recognition-of-vowels-consonants-words-and-phrases",
      "title": "IR-UWB Radar-Based Contactless Silent Speech Recognition of Vowels, Consonants, Words, and Phrases",
      "year": 2023,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "[''",
        "''",
        "''",
        "'']"
      ],
      "url": "https://nao-ki-mura.com/paper/ir-uwb-radar-based-contactless-silent-speech-recognition-of-vowels-consonants-words-and-phrases",
      "doi": "10.1109/ACCESS.2023.3344177",
      "doi_url": "https://doi.org/10.1109/ACCESS.2023.3344177",
      "arxiv_id": "2312.09572",
      "arxiv_url": "https://arxiv.org/abs/2312.09572",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:lip",
        "modality:radar",
        "output:text",
        "task:speech-recognition",
        "evaluation:quantitative",
        "deployment:hands-free",
        "deployment:mobile-suitable"
      ],
      "evidence_count": 9
    },
    {
      "paper_id": "ssi_ultrasensitive-textile-strain-sensors-redefine-wearable-silent-speech-interfaces-with-high-machine-learning-efficiency",
      "slug": "ultrasensitive-textile-strain-sensors-redefine-wearable-silent-speech-interfaces-with-high-machine-learning-efficiency",
      "title": "Ultrasensitive Textile Strain Sensors Redefine Wearable Silent Speech Interfaces with High Machine Learning Efficiency",
      "year": 2023,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Chenyu Tang",
        "Muzi Xu",
        "Wentian Yi",
        "Zibo Zhang",
        "Edoardo Occhipinti",
        "Chaoqun Dong",
        "Dafydd Ravenscroft",
        "Sung-Min Jung",
        "Shuo Gao",
        "Jong Min Kim",
        "Luigi G. Occhipinti"
      ],
      "url": "https://nao-ki-mura.com/paper/ultrasensitive-textile-strain-sensors-redefine-wearable-silent-speech-interfaces-with-high-machine-learning-efficiency",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2311.15683",
      "arxiv_url": "https://arxiv.org/abs/2311.15683",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:throat",
        "modality:vibration",
        "task:command-recognition",
        "output:labels",
        "deployment:hands-free",
        "deployment:real-time",
        "deployment:speaker-dependent",
        "deployment:wearable",
        "evaluation:quantitative"
      ],
      "evidence_count": 6
    },
    {
      "paper_id": "ssi_distributed-pressure-matching-strategy-using-diffusion-adaptation",
      "slug": "distributed-pressure-matching-strategy-using-diffusion-adaptation",
      "title": "Distributed pressure matching strategy using diffusion adaptation",
      "year": 2023,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "[''",
        "''",
        "''",
        "'']"
      ],
      "url": "https://nao-ki-mura.com/paper/distributed-pressure-matching-strategy-using-diffusion-adaptation",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2311.07729",
      "arxiv_url": "https://arxiv.org/abs/2311.07729",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "modality:acoustic",
        "modality:microphone",
        "output:audio",
        "evaluation:quantitative"
      ],
      "evidence_count": 6
    },
    {
      "paper_id": "ssi_advancing-test-time-adaptation-for-acoustic-foundation-models-in-open-world-shifts",
      "slug": "advancing-test-time-adaptation-for-acoustic-foundation-models-in-open-world-shifts",
      "title": "Advancing Test-Time Adaptation for Acoustic Foundation Models in Open-World Shifts",
      "year": 2023,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Hongfu Liu",
        "Hengguan Huang",
        "Ye Wang"
      ],
      "url": "https://nao-ki-mura.com/paper/advancing-test-time-adaptation-for-acoustic-foundation-models-in-open-world-shifts",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2310.09505",
      "arxiv_url": "https://arxiv.org/abs/2310.09505",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "evaluation:quantitative",
        "modality:acoustic",
        "output:text",
        "task:speech-recognition"
      ],
      "evidence_count": 5
    },
    {
      "paper_id": "ssi_sound-source-localization-is-all-about-cross-modal-alignment",
      "slug": "sound-source-localization-is-all-about-cross-modal-alignment",
      "title": "Sound Source Localization is All about Cross-Modal Alignment",
      "year": 2023,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Arda Senocak",
        "Hyeonggon Ryu",
        "Junsik Kim",
        "Tae-Hyun Oh",
        "Hanspeter Pfister",
        "Joon Son Chung"
      ],
      "url": "https://nao-ki-mura.com/paper/sound-source-localization-is-all-about-cross-modal-alignment",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2309.10724",
      "arxiv_url": "https://arxiv.org/abs/2309.10724",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "modality:multimodal",
        "output:labels",
        "evaluation:quantitative"
      ],
      "evidence_count": 5
    },
    {
      "paper_id": "ssi_let-there-be-sound-reconstructing-high-quality-speech-from-silent-videos",
      "slug": "let-there-be-sound-reconstructing-high-quality-speech-from-silent-videos",
      "title": "Let There Be Sound: Reconstructing High Quality Speech from Silent Videos",
      "year": 2023,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Ji-Hoon Kim",
        "Jaehun Kim",
        "Joon Son Chung"
      ],
      "url": "https://nao-ki-mura.com/paper/let-there-be-sound-reconstructing-high-quality-speech-from-silent-videos",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2308.15256",
      "arxiv_url": "https://arxiv.org/abs/2308.15256",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:lip",
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "modality:video",
        "output:speech-audio",
        "task:speech-reconstruction"
      ],
      "evidence_count": 8
    },
    {
      "paper_id": "ssi_an-initial-exploration-learning-to-generate-realistic-audio-for-silent-video",
      "slug": "an-initial-exploration-learning-to-generate-realistic-audio-for-silent-video",
      "title": "An Initial Exploration: Learning to Generate Realistic Audio for Silent Video",
      "year": 2023,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Matthew Martel",
        "Jackson Wagner"
      ],
      "url": "https://nao-ki-mura.com/paper/an-initial-exploration-learning-to-generate-realistic-audio-for-silent-video",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2308.12408",
      "arxiv_url": "https://arxiv.org/abs/2308.12408",
      "review_state": "expert_fulltext_draft",
      "review_priority": "low",
      "review_confidence": "medium-high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "evaluation:quantitative",
        "modality:video",
        "output:audio"
      ],
      "evidence_count": 4
    },
    {
      "paper_id": "ssi_akvsr-audio-knowledge-empowered-visual-speech-recognition-by-compressing-audio-knowledge-of-a-pretrained-model",
      "slug": "akvsr-audio-knowledge-empowered-visual-speech-recognition-by-compressing-audio-knowledge-of-a-pretrained-model",
      "title": "Audio Knowledge Empowered Visual Speech Recognition",
      "year": 2023,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Jeong Hun Yeo",
        "Minsu Kim",
        "Jeongsoo Choi",
        "Dae Hoe Kim",
        "Yong Man Ro"
      ],
      "url": "https://nao-ki-mura.com/paper/akvsr-audio-knowledge-empowered-visual-speech-recognition-by-compressing-audio-knowledge-of-a-pretrained-model",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2308.07593",
      "arxiv_url": "https://arxiv.org/abs/2308.07593",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium-high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:lip",
        "deployment:hands-free",
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "modality:video",
        "output:text",
        "task:speech-recognition"
      ],
      "evidence_count": 9
    },
    {
      "paper_id": "ssi_knowledge-distilled-ensemble-model-for-semg-based-silent-speech-interface",
      "slug": "knowledge-distilled-ensemble-model-for-semg-based-silent-speech-interface",
      "title": "Knowledge Distilled Ensemble Model for sEMG-based Silent Speech Interface",
      "year": 2023,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "[''",
        "''",
        "''",
        "''",
        "'']"
      ],
      "url": "https://nao-ki-mura.com/paper/knowledge-distilled-ensemble-model-for-semg-based-silent-speech-interface",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2308.06533",
      "arxiv_url": "https://arxiv.org/abs/2308.06533",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:face",
        "modality:emg",
        "output:text",
        "task:text-entry",
        "evaluation:quantitative",
        "deployment:hands-free",
        "deployment:wearable"
      ],
      "evidence_count": 5
    },
    {
      "paper_id": "ssi_automatically-measuring-speech-fluency-in-people-with-aphasia-first-achievements-using-read-speech-data",
      "slug": "automatically-measuring-speech-fluency-in-people-with-aphasia-first-achievements-using-read-speech-data",
      "title": "Automatically measuring speech fluency in people with aphasia: first achievements using read-speech data",
      "year": 2023,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "[''",
        "''",
        "''",
        "''",
        "'']"
      ],
      "url": "https://nao-ki-mura.com/paper/automatically-measuring-speech-fluency-in-people-with-aphasia-first-achievements-using-read-speech-data",
      "doi": "10.1080/02687038.2023.2244728",
      "doi_url": "https://doi.org/10.1080/02687038.2023.2244728",
      "arxiv_id": "2308.04763",
      "arxiv_url": "https://arxiv.org/abs/2308.04763",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "modality:acoustic",
        "modality:microphone",
        "output:labels",
        "evaluation:quantitative"
      ],
      "evidence_count": 5
    },
    {
      "paper_id": "ssi_exploring-how-a-generative-ai-interprets-music",
      "slug": "exploring-how-a-generative-ai-interprets-music",
      "title": "Exploring how a Generative AI interprets music",
      "year": 2023,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "[''",
        "''",
        "''",
        "'']"
      ],
      "url": "https://nao-ki-mura.com/paper/exploring-how-a-generative-ai-interprets-music",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2308.00015",
      "arxiv_url": "https://arxiv.org/abs/2308.00015",
      "review_state": "expert_fulltext_draft",
      "review_priority": "low",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "modality:acoustic",
        "output:audio",
        "evaluation:quantitative"
      ],
      "evidence_count": 4
    },
    {
      "paper_id": "ssi_audio-visual-video-to-speech-synthesis-with-synthesized-input-audio",
      "slug": "audio-visual-video-to-speech-synthesis-with-synthesized-input-audio",
      "title": "Audio-visual video-to-speech synthesis with synthesized input audio",
      "year": 2023,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Triantafyllos Kefalas",
        "Yannis Panagakis",
        "Maja Pantic"
      ],
      "url": "https://nao-ki-mura.com/paper/audio-visual-video-to-speech-synthesis-with-synthesized-input-audio",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2307.16584",
      "arxiv_url": "https://arxiv.org/abs/2307.16584",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:lip",
        "deployment:hands-free",
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "modality:multimodal",
        "modality:video",
        "output:speech-audio",
        "task:speech-reconstruction"
      ],
      "evidence_count": 8
    },
    {
      "paper_id": "ssi_audio-aware-query-enhanced-transformer-for-audio-visual-segmentation",
      "slug": "audio-aware-query-enhanced-transformer-for-audio-visual-segmentation",
      "title": "Audio-aware Query-enhanced Transformer for Audio-Visual Segmentation",
      "year": 2023,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Jinxiang Liu",
        "Chen Ju",
        "Chaofan Ma",
        "Yanfeng Wang",
        "Yu Wang",
        "Ya Zhang"
      ],
      "url": "https://nao-ki-mura.com/paper/audio-aware-query-enhanced-transformer-for-audio-visual-segmentation",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2307.13236",
      "arxiv_url": "https://arxiv.org/abs/2307.13236",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "high",
      "review_basis": "full_text+structured_benchmark+summary",
      "source_coverage": "high",
      "tags": [
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "modality:acoustic",
        "modality:multimodal",
        "modality:video",
        "output:labels"
      ],
      "evidence_count": 4
    },
    {
      "paper_id": "ssi_robustl2s-speaker-specific-lip-to-speech-synthesis-exploiting-self-supervised-representations",
      "slug": "robustl2s-speaker-specific-lip-to-speech-synthesis-exploiting-self-supervised-representations",
      "title": "RobustL2S: Speaker-Specific Lip-to-Speech Synthesis exploiting Self-Supervised Representations",
      "year": 2023,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Neha Sahipjohn",
        "Neil Shah",
        "Vishal Tambrahalli",
        "Vineet Gandhi"
      ],
      "url": "https://nao-ki-mura.com/paper/robustl2s-speaker-specific-lip-to-speech-synthesis-exploiting-self-supervised-representations",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2307.01233",
      "arxiv_url": "https://arxiv.org/abs/2307.01233",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:lip",
        "deployment:speaker-dependent",
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "modality:video",
        "output:speech-audio",
        "task:speech-reconstruction"
      ],
      "evidence_count": 5
    },
    {
      "paper_id": "ssi_diff-foley-synchronized-video-to-audio-synthesis-with-latent-diffusion-models",
      "slug": "diff-foley-synchronized-video-to-audio-synthesis-with-latent-diffusion-models",
      "title": "Diff-Foley: Synchronized Video-to-Audio Synthesis with Latent Diffusion Models",
      "year": 2023,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Chuanhao Luo",
        "Chenxu Yan",
        "Hang Hu",
        "et al."
      ],
      "url": "https://nao-ki-mura.com/paper/diff-foley-synchronized-video-to-audio-synthesis-with-latent-diffusion-models",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2306.17203",
      "arxiv_url": "https://arxiv.org/abs/2306.17203",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "high",
      "review_basis": "full_text+structured_benchmark+summary",
      "source_coverage": "high",
      "tags": [
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "modality:acoustic",
        "modality:multimodal",
        "modality:video",
        "output:audio"
      ],
      "evidence_count": 4
    },
    {
      "paper_id": "ssi_high-quality-automatic-voice-over-with-accurate-alignment-supervision-through-self-supervised-discrete-speech-units",
      "slug": "high-quality-automatic-voice-over-with-accurate-alignment-supervision-through-self-supervised-discrete-speech-units",
      "title": "High-Quality Automatic Voice Over with Accurate Alignment: Supervision through Self-Supervised Discrete Speech Units",
      "year": 2023,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Junchen Lu",
        "Berrak Sisman",
        "Mingyang Zhang",
        "Haizhou Li"
      ],
      "url": "https://nao-ki-mura.com/paper/high-quality-automatic-voice-over-with-accurate-alignment-supervision-through-self-supervised-discrete-speech-units",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2306.17005",
      "arxiv_url": "https://arxiv.org/abs/2306.17005",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium-high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:lip",
        "modality:video",
        "output:speech-audio",
        "evaluation:quantitative"
      ],
      "evidence_count": 6
    },
    {
      "paper_id": "ssi_large-scale-unsupervised-audio-pre-training-for-video-to-speech-synthesis",
      "slug": "large-scale-unsupervised-audio-pre-training-for-video-to-speech-synthesis",
      "title": "Large-scale unsupervised audio pre-training for video-to-speech synthesis",
      "year": 2023,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "[''",
        "''",
        "'']"
      ],
      "url": "https://nao-ki-mura.com/paper/large-scale-unsupervised-audio-pre-training-for-video-to-speech-synthesis",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2306.15464",
      "arxiv_url": "https://arxiv.org/abs/2306.15464",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:lip",
        "modality:video",
        "output:speech-audio",
        "task:speech-reconstruction",
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "deployment:speaker-independent"
      ],
      "evidence_count": 8
    },
    {
      "paper_id": "ssi_lipvoicer-generating-speech-from-silent-videos-guided-by-lip-reading",
      "slug": "lipvoicer-generating-speech-from-silent-videos-guided-by-lip-reading",
      "title": "LipVoicer: Generating Speech from Silent Videos Guided by Lip Reading",
      "year": 2023,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Yochai Yemini",
        "Aviv Shamsian",
        "Lior Bracha",
        "Sharon Gannot",
        "Ethan Fetaya"
      ],
      "url": "https://nao-ki-mura.com/paper/lipvoicer-generating-speech-from-silent-videos-guided-by-lip-reading",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2306.03258",
      "arxiv_url": "https://arxiv.org/abs/2306.03258",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:lip",
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "modality:video",
        "output:speech-audio",
        "task:speech-reconstruction"
      ],
      "evidence_count": 8
    },
    {
      "paper_id": "ssi_intelligible-lip-to-speech-synthesis-with-speech-units",
      "slug": "intelligible-lip-to-speech-synthesis-with-speech-units",
      "title": "Intelligible Lip-to-Speech Synthesis with Speech Units",
      "year": 2023,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "[''",
        "''",
        "'']"
      ],
      "url": "https://nao-ki-mura.com/paper/intelligible-lip-to-speech-synthesis-with-speech-units",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2305.19603",
      "arxiv_url": "https://arxiv.org/abs/2305.19603",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:lip",
        "modality:video",
        "output:speech-audio",
        "task:speech-reconstruction",
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "deployment:speaker-independent"
      ],
      "evidence_count": 6
    },
    {
      "paper_id": "ssi_adaptation-of-tongue-ultrasound-based-silent-speech-interfaces-using-spatial-transformer-networks",
      "slug": "adaptation-of-tongue-ultrasound-based-silent-speech-interfaces-using-spatial-transformer-networks",
      "title": "Adaptation of Tongue Ultrasound-Based Silent Speech Interfaces Using Spatial Transformer Networks",
      "year": 2023,
      "venue": "the Proceedings of Interspeech 2023",
      "authors": [
        "László Tóth",
        "Amin Honarmandi Shandiz",
        "Gábor Gosztolya",
        "Tamás Gábor Csapó"
      ],
      "url": "https://nao-ki-mura.com/paper/adaptation-of-tongue-ultrasound-based-silent-speech-interfaces-using-spatial-transformer-networks",
      "doi": "10.21437/Interspeech.2023-1607",
      "doi_url": "https://doi.org/10.21437/Interspeech.2023-1607",
      "arxiv_id": "2305.19130",
      "arxiv_url": "https://arxiv.org/abs/2305.19130",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text+structured_benchmark+summary",
      "source_coverage": "high",
      "tags": [
        "body_site:tongue",
        "deployment:speaker-dependent",
        "deployment:wearable",
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "modality:ultrasound",
        "output:speech-audio",
        "task:speech-reconstruction"
      ],
      "evidence_count": 4
    },
    {
      "paper_id": "ssi_zero-shot-personalized-lip-to-speech-synthesis-with-face-image-based-voice-control",
      "slug": "zero-shot-personalized-lip-to-speech-synthesis-with-face-image-based-voice-control",
      "title": "Zero-shot personalized lip-to-speech synthesis with face image based voice control",
      "year": 2023,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Zheng-Yan Sheng",
        "Yang Ai",
        "Zhen-Hua Ling"
      ],
      "url": "https://nao-ki-mura.com/paper/zero-shot-personalized-lip-to-speech-synthesis-with-face-image-based-voice-control",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2305.14359",
      "arxiv_url": "https://arxiv.org/abs/2305.14359",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:lip",
        "modality:video",
        "task:speech-reconstruction",
        "output:speech-audio",
        "deployment:speaker-independent",
        "evaluation:quantitative"
      ],
      "evidence_count": 12
    },
    {
      "paper_id": "ssi_improving-the-gap-in-visual-speech-recognition-between-normal-and-silent-speech-based-on-metric-learning",
      "slug": "improving-the-gap-in-visual-speech-recognition-between-normal-and-silent-speech-based-on-metric-learning",
      "title": "Improving the Gap in Visual Speech Recognition Between Normal and Silent Speech Based on Metric Learning",
      "year": 2023,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "[''",
        "''",
        "''",
        "'']"
      ],
      "url": "https://nao-ki-mura.com/paper/improving-the-gap-in-visual-speech-recognition-between-normal-and-silent-speech-based-on-metric-learning",
      "doi": "10.21437/Interspeech.2023-370",
      "doi_url": "https://doi.org/10.21437/Interspeech.2023-370",
      "arxiv_id": "2305.14203",
      "arxiv_url": "https://arxiv.org/abs/2305.14203",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:lip",
        "modality:video",
        "output:text",
        "task:speech-recognition",
        "evaluation:quantitative",
        "deployment:speaker-independent"
      ],
      "evidence_count": 6
    },
    {
      "paper_id": "ssi_conditional-generation-of-audio-from-video-via-foley-analogies",
      "slug": "conditional-generation-of-audio-from-video-via-foley-analogies",
      "title": "Conditional Generation of Audio from Video via Foley Analogies",
      "year": 2023,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Yuexi Du",
        "Ziyang Chen",
        "Justin Salamon",
        "Bryan Russell",
        "Andrew Owens"
      ],
      "url": "https://nao-ki-mura.com/paper/conditional-generation-of-audio-from-video-via-foley-analogies",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2304.08490",
      "arxiv_url": "https://arxiv.org/abs/2304.08490",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "high",
      "review_basis": "full_text+structured_benchmark+summary",
      "source_coverage": "high",
      "tags": [
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "modality:acoustic",
        "modality:multimodal",
        "modality:video",
        "output:audio"
      ],
      "evidence_count": 4
    },
    {
      "paper_id": "ssi_speech-reconstruction-from-silent-tongue-and-lip-articulation-by-pseudo-target-generation-and-domain-adversarial-training",
      "slug": "speech-reconstruction-from-silent-tongue-and-lip-articulation-by-pseudo-target-generation-and-domain-adversarial-training",
      "title": "Speech Reconstruction from Silent Tongue and Lip Articulation By Pseudo Target Generation and Domain Adversarial Training",
      "year": 2023,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Ruichen Zheng",
        "Yang Ai",
        "Zhenhua Ling"
      ],
      "url": "https://nao-ki-mura.com/paper/speech-reconstruction-from-silent-tongue-and-lip-articulation-by-pseudo-target-generation-and-domain-adversarial-training",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2304.05574",
      "arxiv_url": "https://arxiv.org/abs/2304.05574",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:lip",
        "body_site:oral-cavity",
        "body_site:tongue",
        "evaluation:quantitative",
        "modality:multimodal",
        "modality:ultrasound",
        "modality:video",
        "output:speech-audio",
        "task:speech-reconstruction"
      ],
      "evidence_count": 5
    },
    {
      "paper_id": "ssi_wesper-zero-shot-and-realtime-whisper-to-normal-voice-conversion-for-whisper-based-speech-interactions",
      "slug": "wesper-zero-shot-and-realtime-whisper-to-normal-voice-conversion-for-whisper-based-speech-interactions",
      "title": "WESPER: Zero-shot and Realtime Whisper to Normal Voice Conversion for Whisper-based Speech Interactions",
      "year": 2023,
      "venue": "Proceedings of the 2023 CHI Conference on Human Factors in Computing Systems (CHI '23), April 23--28, 2023",
      "authors": [
        "Jun Rekimoto"
      ],
      "url": "https://nao-ki-mura.com/paper/wesper-zero-shot-and-realtime-whisper-to-normal-voice-conversion-for-whisper-based-speech-interactions",
      "doi": "10.1145/3544548.3580706",
      "doi_url": "https://doi.org/10.1145/3544548.3580706",
      "arxiv_id": "2303.01639",
      "arxiv_url": "https://arxiv.org/abs/2303.01639",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "deployment:real-time",
        "deployment:speaker-independent",
        "evaluation:quantitative",
        "modality:acoustic",
        "output:speech-audio",
        "task:speech-reconstruction"
      ],
      "evidence_count": 4
    },
    {
      "paper_id": "ssi_duration-aware-pause-insertion-using-pre-trained-language-model-for-multi-speaker-text-to-speech",
      "slug": "duration-aware-pause-insertion-using-pre-trained-language-model-for-multi-speaker-text-to-speech",
      "title": "Duration-aware pause insertion using pre-trained language model for multi-speaker text-to-speech",
      "year": 2023,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Dong Yang",
        "Tomoki Koriyama",
        "Yuki Saito",
        "Takaaki Saeki",
        "Detai Xin",
        "Hiroshi Saruwatari"
      ],
      "url": "https://nao-ki-mura.com/paper/duration-aware-pause-insertion-using-pre-trained-language-model-for-multi-speaker-text-to-speech",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2302.13652",
      "arxiv_url": "https://arxiv.org/abs/2302.13652",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "output:speech-audio",
        "evaluation:quantitative"
      ],
      "evidence_count": 8
    },
    {
      "paper_id": "ssi_liplearner-customizable-silent-speech-interactions-on-mobile-devices",
      "slug": "liplearner-customizable-silent-speech-interactions-on-mobile-devices",
      "title": "LipLearner: Customizable Silent Speech Interactions on Mobile Devices",
      "year": 2023,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Zixiong Su",
        "Shitao Fang",
        "Jun Rekimoto"
      ],
      "url": "https://nao-ki-mura.com/paper/liplearner-customizable-silent-speech-interactions-on-mobile-devices",
      "doi": "10.1145/3544548.3581465",
      "doi_url": "https://doi.org/10.1145/3544548.3581465",
      "arxiv_id": "2302.05907",
      "arxiv_url": "https://arxiv.org/abs/2302.05907",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:lip",
        "deployment:hands-free",
        "deployment:mobile-suitable",
        "deployment:real-time",
        "evaluation:quantitative",
        "evaluation:walking-tested",
        "modality:video",
        "output:commands",
        "task:command-recognition"
      ],
      "evidence_count": 5
    },
    {
      "paper_id": "ssi_towards-neural-decoding-of-imagined-speech-based-on-spoken-speech",
      "slug": "towards-neural-decoding-of-imagined-speech-based-on-spoken-speech",
      "title": "Towards Neural Decoding of Imagined Speech based on Spoken Speech",
      "year": 2022,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Seo-Hyun Lee",
        "Young-Eun Lee",
        "Soowon Kim",
        "Byung-Kwan Ko",
        "Seong-Whan Lee"
      ],
      "url": "https://nao-ki-mura.com/paper/towards-neural-decoding-of-imagined-speech-based-on-spoken-speech",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2212.02047",
      "arxiv_url": "https://arxiv.org/abs/2212.02047",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "medium-high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:brain",
        "modality:eeg",
        "output:labels",
        "task:speech-recognition",
        "evaluation:quantitative"
      ],
      "evidence_count": 6
    },
    {
      "paper_id": "ssi_breaking-the-trade-off-in-personalized-speech-enhancement-with-cross-task-knowledge-distillation",
      "slug": "breaking-the-trade-off-in-personalized-speech-enhancement-with-cross-task-knowledge-distillation",
      "title": "Breaking the trade-off in personalized speech enhancement with cross-task knowledge distillation",
      "year": 2022,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Hassan Taherian",
        "Seif Emre Eskimez",
        "Takuya Yoshioka"
      ],
      "url": "https://nao-ki-mura.com/paper/breaking-the-trade-off-in-personalized-speech-enhancement-with-cross-task-knowledge-distillation",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2211.02944",
      "arxiv_url": "https://arxiv.org/abs/2211.02944",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "evaluation:quantitative",
        "modality:acoustic",
        "output:speech-audio",
        "task:speech-enhancement"
      ],
      "evidence_count": 4
    },
    {
      "paper_id": "ssi_movement-detection-of-tongue-and-related-body-parts-using-ir-uwb-radar",
      "slug": "movement-detection-of-tongue-and-related-body-parts-using-ir-uwb-radar",
      "title": "Movement Detection of Tongue and Related Body Parts Using IR-UWB Radar",
      "year": 2022,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Sunghwa Lee",
        "Younghoon Shin"
      ],
      "url": "https://nao-ki-mura.com/paper/movement-detection-of-tongue-and-related-body-parts-using-ir-uwb-radar",
      "doi": "10.1109/ICTC55196.2022.9952644",
      "doi_url": "https://doi.org/10.1109/ICTC55196.2022.9952644",
      "arxiv_id": "2209.01762",
      "arxiv_url": "https://arxiv.org/abs/2209.01762",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium-high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:tongue",
        "deployment:hands-free",
        "evaluation:quantitative",
        "modality:radar",
        "output:labels",
        "task:speech-recognition"
      ],
      "evidence_count": 4
    },
    {
      "paper_id": "ssi_lip-to-speech-synthesis-for-arbitrary-speakers-in-the-wild",
      "slug": "lip-to-speech-synthesis-for-arbitrary-speakers-in-the-wild",
      "title": "Lip-to-Speech Synthesis for Arbitrary Speakers in the Wild",
      "year": 2022,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Sindhu B Hegde",
        "K R Prajwal",
        "Rudrabha Mukhopadhyay",
        "Vinay P Namboodiri",
        "C. V. Jawahar"
      ],
      "url": "https://nao-ki-mura.com/paper/lip-to-speech-synthesis-for-arbitrary-speakers-in-the-wild",
      "doi": "10.1145/3503161.3548081",
      "doi_url": "https://doi.org/10.1145/3503161.3548081",
      "arxiv_id": "2209.00642",
      "arxiv_url": "https://arxiv.org/abs/2209.00642",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:face",
        "body_site:lip",
        "deployment:hands-free",
        "deployment:speaker-independent",
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "modality:video",
        "output:speech-audio",
        "task:speech-reconstruction"
      ],
      "evidence_count": 4
    },
    {
      "paper_id": "ssi_an-anchor-free-detector-for-continuous-speech-keyword-spotting",
      "slug": "an-anchor-free-detector-for-continuous-speech-keyword-spotting",
      "title": "An Anchor-Free Detector for Continuous Speech Keyword Spotting",
      "year": 2022,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Zhiyuan Zhao",
        "Chuanxin Tang",
        "Chengdong Yao",
        "Chong Luo"
      ],
      "url": "https://nao-ki-mura.com/paper/an-anchor-free-detector-for-continuous-speech-keyword-spotting",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2208.04622",
      "arxiv_url": "https://arxiv.org/abs/2208.04622",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "modality:acoustic",
        "output:labels",
        "task:speech-recognition"
      ],
      "evidence_count": 4
    },
    {
      "paper_id": "ssi_fastlts-non-autoregressive-end-to-end-unconstrained-lip-to-speech-synthesis",
      "slug": "fastlts-non-autoregressive-end-to-end-unconstrained-lip-to-speech-synthesis",
      "title": "FastLTS: Non-Autoregressive End-to-End Unconstrained Lip-to-Speech Synthesis",
      "year": 2022,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Yongqi Wang",
        "Zhou Zhao"
      ],
      "url": "https://nao-ki-mura.com/paper/fastlts-non-autoregressive-end-to-end-unconstrained-lip-to-speech-synthesis",
      "doi": "10.1145/3503161.3548194",
      "doi_url": "https://doi.org/10.1145/3503161.3548194",
      "arxiv_id": "2207.03800",
      "arxiv_url": "https://arxiv.org/abs/2207.03800",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:face",
        "body_site:lip",
        "deployment:hands-free",
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "modality:video",
        "output:speech-audio",
        "task:speech-reconstruction"
      ],
      "evidence_count": 4
    },
    {
      "paper_id": "ssi_improved-processing-of-ultrasound-tongue-videos-by-combining-convlstm-and-3d-convolutional-networks",
      "slug": "improved-processing-of-ultrasound-tongue-videos-by-combining-convlstm-and-3d-convolutional-networks",
      "title": "Improved Processing of Ultrasound Tongue Videos by Combining ConvLSTM and 3D Convolutional Networks",
      "year": 2022,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Amin Honarmandi Shandiz",
        "László Tóth"
      ],
      "url": "https://nao-ki-mura.com/paper/improved-processing-of-ultrasound-tongue-videos-by-combining-convlstm-and-3d-convolutional-networks",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2206.12947",
      "arxiv_url": "https://arxiv.org/abs/2206.12947",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:tongue",
        "modality:ultrasound",
        "task:speech-reconstruction",
        "output:speech-audio",
        "deployment:speaker-dependent",
        "evaluation:quantitative"
      ],
      "evidence_count": 6
    },
    {
      "paper_id": "ssi_visagesyntalk-unseen-speaker-video-to-speech-synthesis-via-speech-visage-feature-selection",
      "slug": "visagesyntalk-unseen-speaker-video-to-speech-synthesis-via-speech-visage-feature-selection",
      "title": "VisageSynTalk: Unseen Speaker Video-to-Speech Synthesis via Speech-Visage Feature Selection",
      "year": 2022,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Joanna Hong",
        "Minsu Kim",
        "Yong Man Ro"
      ],
      "url": "https://nao-ki-mura.com/paper/visagesyntalk-unseen-speaker-video-to-speech-synthesis-via-speech-visage-feature-selection",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2206.07458",
      "arxiv_url": "https://arxiv.org/abs/2206.07458",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:face",
        "modality:video",
        "output:speech-audio",
        "task:speech-reconstruction",
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "deployment:speaker-independent"
      ],
      "evidence_count": 6
    },
    {
      "paper_id": "ssi_silence-is-sweeter-than-speech-self-supervised-model-using-silence-to-store-speaker-information",
      "slug": "silence-is-sweeter-than-speech-self-supervised-model-using-silence-to-store-speaker-information",
      "title": "Silence is Sweeter Than Speech: Self-Supervised Model Using Silence to Store Speaker Information",
      "year": 2022,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Chi-Luen Feng",
        "Po-chun Hsu",
        "Hung-yi Lee"
      ],
      "url": "https://nao-ki-mura.com/paper/silence-is-sweeter-than-speech-self-supervised-model-using-silence-to-store-speaker-information",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2205.03759",
      "arxiv_url": "https://arxiv.org/abs/2205.03759",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "modality:acoustic",
        "output:labels",
        "evaluation:quantitative"
      ],
      "evidence_count": 5
    },
    {
      "paper_id": "ssi_svts-scalable-video-to-speech-synthesis",
      "slug": "svts-scalable-video-to-speech-synthesis",
      "title": "SVTS: Scalable Video-to-Speech Synthesis",
      "year": 2022,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Rodrigo Mira",
        "Alexandros Haliassos",
        "Stavros Petridis",
        "Björn W. Schuller",
        "Maja Pantic"
      ],
      "url": "https://nao-ki-mura.com/paper/svts-scalable-video-to-speech-synthesis",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2205.02058",
      "arxiv_url": "https://arxiv.org/abs/2205.02058",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:lip",
        "modality:video",
        "task:speech-reconstruction",
        "output:speech-audio",
        "deployment:speaker-independent",
        "evaluation:quantitative",
        "evaluation:unseen-words"
      ],
      "evidence_count": 8
    },
    {
      "paper_id": "ssi_listen-only-to-me-how-well-can-target-speech-extraction-handle-false-alarms",
      "slug": "listen-only-to-me-how-well-can-target-speech-extraction-handle-false-alarms",
      "title": "Listen only to me! How well can target speech extraction handle false alarms?",
      "year": 2022,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Marc Delcroix",
        "Keisuke Kinoshita",
        "Tsubasa Ochiai",
        "Katerina Zmolikova",
        "Hiroshi Sato",
        "Tomohiro Nakatani"
      ],
      "url": "https://nao-ki-mura.com/paper/listen-only-to-me-how-well-can-target-speech-extraction-handle-false-alarms",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2204.04811",
      "arxiv_url": "https://arxiv.org/abs/2204.04811",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "modality:acoustic",
        "output:speech-audio",
        "task:speech-enhancement"
      ],
      "evidence_count": 4
    },
    {
      "paper_id": "ssi_multi-modality-associative-bridging-through-memory-speech-sound-recollected-from-face-video",
      "slug": "multi-modality-associative-bridging-through-memory-speech-sound-recollected-from-face-video",
      "title": "Multi-modality Associative Bridging through Memory: Speech Sound Recollected from Face Video",
      "year": 2022,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Minsu Kim",
        "Joanna Hong",
        "Se Jin Park",
        "Yong Man Ro"
      ],
      "url": "https://nao-ki-mura.com/paper/multi-modality-associative-bridging-through-memory-speech-sound-recollected-from-face-video",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2204.01265",
      "arxiv_url": "https://arxiv.org/abs/2204.01265",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:face",
        "body_site:lip",
        "deployment:hands-free",
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "modality:multimodal",
        "modality:video",
        "output:speech-audio",
        "task:speech-reconstruction"
      ],
      "evidence_count": 4
    },
    {
      "paper_id": "ssi_vcvts-multi-speaker-video-to-speech-synthesis-via-cross-modal-knowledge-transfer-from-voice-conversion",
      "slug": "vcvts-multi-speaker-video-to-speech-synthesis-via-cross-modal-knowledge-transfer-from-voice-conversion",
      "title": "VCVTS: Multi-speaker Video-to-Speech synthesis via cross-modal knowledge transfer from voice conversion",
      "year": 2022,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Disong Wang",
        "Shan Yang",
        "Dan Su",
        "Xunying Liu",
        "Dong Yu",
        "Helen Meng"
      ],
      "url": "https://nao-ki-mura.com/paper/vcvts-multi-speaker-video-to-speech-synthesis-via-cross-modal-knowledge-transfer-from-voice-conversion",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2202.09081",
      "arxiv_url": "https://arxiv.org/abs/2202.09081",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:lip",
        "deployment:hands-free",
        "deployment:speaker-independent",
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "modality:video",
        "output:speech-audio",
        "task:speech-reconstruction"
      ],
      "evidence_count": 4
    },
    {
      "paper_id": "ssi_supervised-and-self-supervised-pretraining-based-covid-19-detection-using-acoustic-breathing-cough-speech-signals",
      "slug": "supervised-and-self-supervised-pretraining-based-covid-19-detection-using-acoustic-breathing-cough-speech-signals",
      "title": "Supervised and Self-supervised Pretraining Based COVID-19 Detection Using Acoustic Breathing/Cough/Speech Signals",
      "year": 2022,
      "venue": "ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), 2022, pp. 561-565",
      "authors": [
        "Xing-Yu Chen",
        "Qiu-Shi Zhu",
        "Jie Zhang",
        "Li-Rong Dai"
      ],
      "url": "https://nao-ki-mura.com/paper/supervised-and-self-supervised-pretraining-based-covid-19-detection-using-acoustic-breathing-cough-speech-signals",
      "doi": "10.1109/ICASSP43922.2022.9746205",
      "doi_url": "https://doi.org/10.1109/ICASSP43922.2022.9746205",
      "arxiv_id": "2201.08934",
      "arxiv_url": "https://arxiv.org/abs/2201.08934",
      "review_state": "expert_fulltext_draft",
      "review_priority": "low",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "modality:acoustic",
        "output:labels",
        "task:audio-classification"
      ],
      "evidence_count": 3
    },
    {
      "paper_id": "ssi_visualtts-tts-with-accurate-lip-speech-synchronization-for-automatic-voice-over",
      "slug": "visualtts-tts-with-accurate-lip-speech-synchronization-for-automatic-voice-over",
      "title": "VisualTTS: TTS with Accurate Lip-Speech Synchronization for Automatic Voice Over",
      "year": 2022,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Junchen Lu",
        "Berrak Sisman",
        "Rui Liu",
        "Mingyang Zhang",
        "Haizhou Li"
      ],
      "url": "https://nao-ki-mura.com/paper/visualtts-tts-with-accurate-lip-speech-synchronization-for-automatic-voice-over",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2110.03342",
      "arxiv_url": "https://arxiv.org/abs/2110.03342",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium-high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:lip",
        "deployment:hands-free",
        "deployment:speaker-independent",
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "modality:video",
        "output:speech-audio",
        "task:speech-reconstruction"
      ],
      "evidence_count": 7
    },
    {
      "paper_id": "ssi_sequence-to-sequence-voice-reconstruction-for-silent-speech-in-a-tonal-language",
      "slug": "sequence-to-sequence-voice-reconstruction-for-silent-speech-in-a-tonal-language",
      "title": "Sequence-to-Sequence Voice Reconstruction for Silent Speech in a Tonal Language",
      "year": 2022,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Huiyan Li",
        "Haohong Lin",
        "You Wang",
        "Hengyang Wang",
        "Ming Zhang",
        "Han Gao",
        "Qing Ai",
        "Zhiyuan Luo",
        "Guang Li"
      ],
      "url": "https://nao-ki-mura.com/paper/sequence-to-sequence-voice-reconstruction-for-silent-speech-in-a-tonal-language",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2108.00190",
      "arxiv_url": "https://arxiv.org/abs/2108.00190",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:face",
        "deployment:hands-free",
        "deployment:speaker-dependent",
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "modality:emg",
        "output:speech-audio",
        "task:speech-reconstruction"
      ],
      "evidence_count": 10
    },
    {
      "paper_id": "kimura2022_silentspeller",
      "slug": "silentspeller",
      "title": "SilentSpeller: Towards mobile, hands-free, silent speech text entry using electropalatography",
      "year": 2022,
      "venue": "CHI '22",
      "authors": [
        "Naoki Kimura",
        "Tan Gemicioglu",
        "Jonathan Womack",
        "Yuhui Zhao",
        "Richard Li",
        "Abdelkareem Bedri",
        "Zixiong Su",
        "Alex Olwal",
        "Jun Rekimoto",
        "Thad Starner"
      ],
      "url": "https://nao-ki-mura.com/paper/silentspeller",
      "doi": "10.1145/3491102.3502015",
      "doi_url": "https://doi.org/10.1145/3491102.3502015",
      "arxiv_id": "",
      "arxiv_url": "",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text+existing_expert_seed",
      "source_coverage": "high",
      "tags": [
        "body_site:palate",
        "body_site:tongue",
        "deployment:hands-free",
        "deployment:mobile-suitable",
        "deployment:real-time",
        "deployment:speaker-dependent",
        "deployment:wearable",
        "evaluation:quantitative",
        "evaluation:unseen-words",
        "evaluation:walking-tested",
        "modality:electropalatography",
        "output:text",
        "task:text-entry"
      ],
      "evidence_count": 8
    },
    {
      "paper_id": "ssi_sa-sdr-a-novel-loss-function-for-separation-of-meeting-style-data",
      "slug": "sa-sdr-a-novel-loss-function-for-separation-of-meeting-style-data",
      "title": "SA-SDR: A novel loss function for separation of meeting style data",
      "year": 2021,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Thilo von Neumann",
        "Keisuke Kinoshita",
        "Christoph Boeddeker",
        "Marc Delcroix",
        "Reinhold Haeb-Umbach"
      ],
      "url": "https://nao-ki-mura.com/paper/sa-sdr-a-novel-loss-function-for-separation-of-meeting-style-data",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2110.15581",
      "arxiv_url": "https://arxiv.org/abs/2110.15581",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "modality:acoustic",
        "output:speech-audio",
        "task:speech-enhancement",
        "evaluation:quantitative",
        "evaluation:structured-benchmark"
      ],
      "evidence_count": 3
    },
    {
      "paper_id": "ssi_advances-and-challenges-in-deep-lip-reading",
      "slug": "advances-and-challenges-in-deep-lip-reading",
      "title": "Advances and Challenges in Deep Lip Reading",
      "year": 2021,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Marzieh Oghbaie",
        "Arian Sabaghi",
        "Kooshan Hashemifard",
        "Mohammad Akbari"
      ],
      "url": "https://nao-ki-mura.com/paper/advances-and-challenges-in-deep-lip-reading",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2110.07879",
      "arxiv_url": "https://arxiv.org/abs/2110.07879",
      "review_state": "expert_fulltext_draft",
      "review_priority": "low",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:face",
        "body_site:lip",
        "modality:video",
        "task:survey"
      ],
      "evidence_count": 4
    },
    {
      "paper_id": "ssi_sub-word-level-lip-reading-with-visual-attention",
      "slug": "sub-word-level-lip-reading-with-visual-attention",
      "title": "Sub-word Level Lip Reading With Visual Attention",
      "year": 2021,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "K R Prajwal",
        "Triantafyllos Afouras",
        "Andrew Zisserman"
      ],
      "url": "https://nao-ki-mura.com/paper/sub-word-level-lip-reading-with-visual-attention",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2110.07603",
      "arxiv_url": "https://arxiv.org/abs/2110.07603",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:face",
        "body_site:lip",
        "modality:video",
        "output:text",
        "task:speech-recognition",
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "deployment:hands-free",
        "evaluation:unseen-words"
      ],
      "evidence_count": 3
    },
    {
      "paper_id": "ssi_speech-synthesis-from-text-and-ultrasound-tongue-image-based-articulatory-input",
      "slug": "speech-synthesis-from-text-and-ultrasound-tongue-image-based-articulatory-input",
      "title": "Speech Synthesis from Text and Ultrasound Tongue Image-based Articulatory Input",
      "year": 2021,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Tamas Gabor Csapo",
        "Laszlo Toth",
        "Gabor Gosztolya",
        "Alexandra Marko"
      ],
      "url": "https://nao-ki-mura.com/paper/speech-synthesis-from-text-and-ultrasound-tongue-image-based-articulatory-input",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2107.02003",
      "arxiv_url": "https://arxiv.org/abs/2107.02003",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium-high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:tongue",
        "modality:ultrasound",
        "output:speech-audio",
        "task:speech-reconstruction",
        "evaluation:quantitative",
        "deployment:hands-free",
        "deployment:wearable",
        "deployment:speaker-dependent"
      ],
      "evidence_count": 3
    },
    {
      "paper_id": "ssi_sparsely-overlapped-speech-training-in-the-time-domain-joint-learning-of-target-speech-separation-and-personal-vad-benefits",
      "slug": "sparsely-overlapped-speech-training-in-the-time-domain-joint-learning-of-target-speech-separation-and-personal-vad-benefits",
      "title": "Sparsely Overlapped Speech Training in the Time Domain: Joint Learning of Target Speech Separation and Personal VAD Benefits",
      "year": 2021,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Qingjian Lin",
        "Lin Yang",
        "Xuyang Wang",
        "Luyuan Xie",
        "Chen Jia",
        "Junjie Wang"
      ],
      "url": "https://nao-ki-mura.com/paper/sparsely-overlapped-speech-training-in-the-time-domain-joint-learning-of-target-speech-separation-and-personal-vad-benefits",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2106.14371",
      "arxiv_url": "https://arxiv.org/abs/2106.14371",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "modality:acoustic",
        "output:speech-audio",
        "task:speech-enhancement",
        "evaluation:quantitative",
        "evaluation:structured-benchmark"
      ],
      "evidence_count": 3
    },
    {
      "paper_id": "ssi_silent-speech-and-emotion-recognition-from-vocal-tract-shape-dynamics-in-real-time-mri",
      "slug": "silent-speech-and-emotion-recognition-from-vocal-tract-shape-dynamics-in-real-time-mri",
      "title": "Silent Speech and Emotion Recognition from Vocal Tract Shape Dynamics in Real-Time MRI",
      "year": 2021,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Laxmi Pandey",
        "Ahmed Sabbir Arif"
      ],
      "url": "https://nao-ki-mura.com/paper/silent-speech-and-emotion-recognition-from-vocal-tract-shape-dynamics-in-real-time-mri",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2106.08706",
      "arxiv_url": "https://arxiv.org/abs/2106.08706",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium-high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:lip",
        "body_site:oral-cavity",
        "body_site:palate",
        "body_site:throat",
        "body_site:tongue",
        "modality:magnetic",
        "output:text",
        "task:speech-recognition",
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "deployment:hands-free"
      ],
      "evidence_count": 3
    },
    {
      "paper_id": "ssi_neural-speaker-embeddings-for-ultrasound-based-silent-speech-interfaces",
      "slug": "neural-speaker-embeddings-for-ultrasound-based-silent-speech-interfaces",
      "title": "Neural Speaker Embeddings for Ultrasound-based Silent Speech Interfaces",
      "year": 2021,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Amin Honarmandi Shandiz",
        "László Tóth",
        "Gábor Gosztolya",
        "Alexandra Markó",
        "Tamás Gábor Csapó"
      ],
      "url": "https://nao-ki-mura.com/paper/neural-speaker-embeddings-for-ultrasound-based-silent-speech-interfaces",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2106.04552",
      "arxiv_url": "https://arxiv.org/abs/2106.04552",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium-high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:tongue",
        "modality:ultrasound",
        "output:speech-audio",
        "task:speech-reconstruction",
        "evaluation:quantitative"
      ],
      "evidence_count": 7
    },
    {
      "paper_id": "ssi_an-improved-model-for-voicing-silent-speech",
      "slug": "an-improved-model-for-voicing-silent-speech",
      "title": "An Improved Model for Voicing Silent Speech",
      "year": 2021,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "David Gaddy",
        "Dan Klein"
      ],
      "url": "https://nao-ki-mura.com/paper/an-improved-model-for-voicing-silent-speech",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2106.01933",
      "arxiv_url": "https://arxiv.org/abs/2106.01933",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:face",
        "modality:emg",
        "output:speech-audio",
        "task:speech-reconstruction",
        "deployment:speaker-dependent",
        "evaluation:quantitative",
        "evaluation:unseen-words"
      ],
      "evidence_count": 5
    },
    {
      "paper_id": "ssi_voice-activity-detection-for-ultrasound-based-silent-speech-interfaces-using-convolutional-neural-networks",
      "slug": "voice-activity-detection-for-ultrasound-based-silent-speech-interfaces-using-convolutional-neural-networks",
      "title": "Voice Activity Detection for Ultrasound-based Silent Speech Interfaces using Convolutional Neural Networks",
      "year": 2021,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Amin Honarmandi Shandiz",
        "Laszlo Toth"
      ],
      "url": "https://nao-ki-mura.com/paper/voice-activity-detection-for-ultrasound-based-silent-speech-interfaces-using-convolutional-neural-networks",
      "doi": "10.1007/978-3-030-83527-9_43",
      "doi_url": "https://doi.org/10.1007/978-3-030-83527-9_43",
      "arxiv_id": "2105.13718",
      "arxiv_url": "https://arxiv.org/abs/2105.13718",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium-high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:tongue",
        "modality:ultrasound",
        "output:labels",
        "task:speech-enhancement",
        "evaluation:quantitative",
        "deployment:hands-free",
        "deployment:speaker-dependent"
      ],
      "evidence_count": 3
    },
    {
      "paper_id": "ssi_speaker-disentanglement-in-video-to-speech-conversion",
      "slug": "speaker-disentanglement-in-video-to-speech-conversion",
      "title": "Speaker disentanglement in video-to-speech conversion",
      "year": 2021,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Dan Oneata",
        "Adriana Stan",
        "Horia Cucu"
      ],
      "url": "https://nao-ki-mura.com/paper/speaker-disentanglement-in-video-to-speech-conversion",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2105.09652",
      "arxiv_url": "https://arxiv.org/abs/2105.09652",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:face",
        "body_site:lip",
        "deployment:hands-free",
        "deployment:speaker-independent",
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "modality:video",
        "output:speech-audio",
        "task:speech-reconstruction"
      ],
      "evidence_count": 8
    },
    {
      "paper_id": "ssi_improving-neural-silent-speech-interface-models-by-adversarial-training",
      "slug": "improving-neural-silent-speech-interface-models-by-adversarial-training",
      "title": "Improving Neural Silent Speech Interface Models by Adversarial Training",
      "year": 2021,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Amin Honarmandi Shandiz",
        "László Tóth",
        "Gábor Gosztolya",
        "Alexandra Markó",
        "Tamás Gábor Csapó"
      ],
      "url": "https://nao-ki-mura.com/paper/improving-neural-silent-speech-interface-models-by-adversarial-training",
      "doi": "10.1007/978-3-030-76346-6_39",
      "doi_url": "https://doi.org/10.1007/978-3-030-76346-6_39",
      "arxiv_id": "2104.11601",
      "arxiv_url": "https://arxiv.org/abs/2104.11601",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium-high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:tongue",
        "modality:ultrasound",
        "output:speech-audio",
        "task:speech-reconstruction",
        "deployment:speaker-dependent",
        "evaluation:quantitative"
      ],
      "evidence_count": 6
    },
    {
      "paper_id": "ssi_3d-convolutional-neural-networks-for-ultrasound-based-silent-speech-interfaces",
      "slug": "3d-convolutional-neural-networks-for-ultrasound-based-silent-speech-interfaces",
      "title": "3D Convolutional Neural Networks for Ultrasound-Based Silent Speech Interfaces",
      "year": 2021,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "László Tóth",
        "Amin Honarmandi Shandiz"
      ],
      "url": "https://nao-ki-mura.com/paper/3d-convolutional-neural-networks-for-ultrasound-based-silent-speech-interfaces",
      "doi": "10.1007/978-3-030-61401-0_16",
      "doi_url": "https://doi.org/10.1007/978-3-030-61401-0_16",
      "arxiv_id": "2104.11532",
      "arxiv_url": "https://arxiv.org/abs/2104.11532",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium-high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:tongue",
        "modality:ultrasound",
        "output:speech-audio",
        "task:speech-reconstruction",
        "evaluation:quantitative",
        "deployment:speaker-dependent"
      ],
      "evidence_count": 7
    },
    {
      "paper_id": "ssi_htmd-net-a-hybrid-masking-denoising-approach-to-time-domain-monaural-singing-voice-separation",
      "slug": "htmd-net-a-hybrid-masking-denoising-approach-to-time-domain-monaural-singing-voice-separation",
      "title": "HTMD-Net: A Hybrid Masking-Denoising Approach to Time-Domain Monaural Singing Voice Separation",
      "year": 2021,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Christos Garoufis",
        "Athanasia Zlatintsi",
        "Petros Maragos"
      ],
      "url": "https://nao-ki-mura.com/paper/htmd-net-a-hybrid-masking-denoising-approach-to-time-domain-monaural-singing-voice-separation",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2103.04336",
      "arxiv_url": "https://arxiv.org/abs/2103.04336",
      "review_state": "expert_fulltext_draft",
      "review_priority": "low",
      "review_confidence": "medium-high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "modality:acoustic",
        "output:audio",
        "evaluation:quantitative",
        "evaluation:structured-benchmark"
      ],
      "evidence_count": 7
    },
    {
      "paper_id": "ssi_silent-versus-modal-multi-speaker-speech-recognition-from-ultrasound-and-video",
      "slug": "silent-versus-modal-multi-speaker-speech-recognition-from-ultrasound-and-video",
      "title": "Silent versus modal multi-speaker speech recognition from ultrasound and video",
      "year": 2021,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Manuel Sam Ribeiro",
        "Aciel Eshky",
        "Korin Richmond",
        "Steve Renals"
      ],
      "url": "https://nao-ki-mura.com/paper/silent-versus-modal-multi-speaker-speech-recognition-from-ultrasound-and-video",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2103.00333",
      "arxiv_url": "https://arxiv.org/abs/2103.00333",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:lip",
        "body_site:tongue",
        "modality:multimodal",
        "output:text",
        "task:speech-recognition",
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "deployment:hands-free",
        "deployment:speaker-independent"
      ],
      "evidence_count": 3
    },
    {
      "paper_id": "ssi_ema2s-an-end-to-end-multimodal-articulatory-to-speech-system",
      "slug": "ema2s-an-end-to-end-multimodal-articulatory-to-speech-system",
      "title": "EMA2S: An End-to-End Multimodal Articulatory-to-Speech System",
      "year": 2021,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Yu-Wen Chen",
        "Kuo-Hsuan Hung",
        "Shang-Yi Chuang",
        "Jonathan Sherman",
        "Wen-Chin Huang",
        "Xugang Lu",
        "Yu Tsao"
      ],
      "url": "https://nao-ki-mura.com/paper/ema2s-an-end-to-end-multimodal-articulatory-to-speech-system",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2102.03786",
      "arxiv_url": "https://arxiv.org/abs/2102.03786",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium-high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:jaw",
        "body_site:lip",
        "body_site:tongue",
        "modality:magnetic",
        "output:speech-audio",
        "task:speech-reconstruction",
        "evaluation:quantitative"
      ],
      "evidence_count": 9
    },
    {
      "paper_id": "ssi_convolutional-neural-network-based-age-estimation-using-b-mode-ultrasound-tongue-image",
      "slug": "convolutional-neural-network-based-age-estimation-using-b-mode-ultrasound-tongue-image",
      "title": "Convolutional Neural Network-Based Age Estimation Using B-Mode Ultrasound Tongue Image",
      "year": 2021,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Kele Xu",
        "Tamas Gabor Csapo",
        "Ming Feng"
      ],
      "url": "https://nao-ki-mura.com/paper/convolutional-neural-network-based-age-estimation-using-b-mode-ultrasound-tongue-image",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2101.11245",
      "arxiv_url": "https://arxiv.org/abs/2101.11245",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:tongue",
        "modality:ultrasound",
        "output:labels",
        "evaluation:quantitative"
      ],
      "evidence_count": 3
    },
    {
      "paper_id": "ssi_end-to-end-silent-speech-recognition-with-acoustic-sensing",
      "slug": "end-to-end-silent-speech-recognition-with-acoustic-sensing",
      "title": "End-to-end Silent Speech Recognition with Acoustic Sensing",
      "year": 2020,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Jian Luo",
        "Jianzong Wang",
        "Ning Cheng",
        "Guilin Jiang",
        "Jing Xiao"
      ],
      "url": "https://nao-ki-mura.com/paper/end-to-end-silent-speech-recognition-with-acoustic-sensing",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2011.11315",
      "arxiv_url": "https://arxiv.org/abs/2011.11315",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:lip",
        "modality:acoustic",
        "modality:microphone",
        "task:speech-recognition",
        "output:text",
        "deployment:mobile-suitable",
        "deployment:speaker-independent",
        "evaluation:quantitative",
        "evaluation:unseen-words"
      ],
      "evidence_count": 3
    },
    {
      "paper_id": "ssi_speech-prediction-in-silent-videos-using-variational-autoencoders",
      "slug": "speech-prediction-in-silent-videos-using-variational-autoencoders",
      "title": "Speech Prediction in Silent Videos using Variational Autoencoders",
      "year": 2020,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Ravindra Yadav",
        "Ashish Sardana",
        "Vinay P Namboodiri",
        "Rajesh M Hegde"
      ],
      "url": "https://nao-ki-mura.com/paper/speech-prediction-in-silent-videos-using-variational-autoencoders",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2011.07340",
      "arxiv_url": "https://arxiv.org/abs/2011.07340",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:face",
        "body_site:lip",
        "modality:video",
        "output:speech-audio",
        "task:speech-reconstruction",
        "evaluation:quantitative",
        "evaluation:structured-benchmark"
      ],
      "evidence_count": 4
    },
    {
      "paper_id": "ssi_x-tasnet-robust-and-accurate-time-domain-speaker-extraction-network",
      "slug": "x-tasnet-robust-and-accurate-time-domain-speaker-extraction-network",
      "title": "X-TaSNet: Robust and Accurate Time-Domain Speaker Extraction Network",
      "year": 2020,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Zining Zhang",
        "Bingsheng He",
        "Zhenjie Zhang"
      ],
      "url": "https://nao-ki-mura.com/paper/x-tasnet-robust-and-accurate-time-domain-speaker-extraction-network",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2010.12766",
      "arxiv_url": "https://arxiv.org/abs/2010.12766",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "modality:acoustic",
        "output:speech-audio",
        "task:speech-enhancement"
      ],
      "evidence_count": 7
    },
    {
      "paper_id": "ssi_listening-to-sounds-of-silence-for-speech-denoising",
      "slug": "listening-to-sounds-of-silence-for-speech-denoising",
      "title": "Listening to Sounds of Silence for Speech Denoising",
      "year": 2020,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Ruilin Xu",
        "Rundi Wu",
        "Yuko Ishiwaka",
        "Carl Vondrick",
        "Changxi Zheng"
      ],
      "url": "https://nao-ki-mura.com/paper/listening-to-sounds-of-silence-for-speech-denoising",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2010.12013",
      "arxiv_url": "https://arxiv.org/abs/2010.12013",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "modality:acoustic",
        "output:speech-audio",
        "task:speech-enhancement",
        "evaluation:quantitative",
        "evaluation:structured-benchmark"
      ],
      "evidence_count": 4
    },
    {
      "paper_id": "ssi_discriminative-sounding-objects-localization-via-self-supervised-audiovisual-matching",
      "slug": "discriminative-sounding-objects-localization-via-self-supervised-audiovisual-matching",
      "title": "Discriminative Sounding Objects Localization via Self-supervised Audiovisual Matching",
      "year": 2020,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Di Hu",
        "Rui Qian",
        "Minyue Jiang",
        "Xiao Tan",
        "Shilei Wen",
        "Errui Ding",
        "Weiyao Lin",
        "Dejing Dou"
      ],
      "url": "https://nao-ki-mura.com/paper/discriminative-sounding-objects-localization-via-self-supervised-audiovisual-matching",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2010.05466",
      "arxiv_url": "https://arxiv.org/abs/2010.05466",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "modality:multimodal",
        "output:labels"
      ],
      "evidence_count": 8
    },
    {
      "paper_id": "ssi_digital-voicing-of-silent-speech",
      "slug": "digital-voicing-of-silent-speech",
      "title": "Digital Voicing of Silent Speech",
      "year": 2020,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "David Gaddy",
        "Dan Klein"
      ],
      "url": "https://nao-ki-mura.com/paper/digital-voicing-of-silent-speech",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2010.02960",
      "arxiv_url": "https://arxiv.org/abs/2010.02960",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:face",
        "body_site:throat",
        "modality:emg",
        "task:speech-reconstruction",
        "output:speech-audio",
        "deployment:speaker-dependent",
        "deployment:wearable",
        "evaluation:quantitative",
        "evaluation:unseen-words"
      ],
      "evidence_count": 3
    },
    {
      "paper_id": "ssi_end-to-end-speaker-dependent-voice-activity-detection",
      "slug": "end-to-end-speaker-dependent-voice-activity-detection",
      "title": "End-to-End Speaker-Dependent Voice Activity Detection",
      "year": 2020,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Yefei Chen",
        "Shuai Wang",
        "Yanmin Qian",
        "Kai Yu"
      ],
      "url": "https://nao-ki-mura.com/paper/end-to-end-speaker-dependent-voice-activity-detection",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2009.09906",
      "arxiv_url": "https://arxiv.org/abs/2009.09906",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "modality:acoustic",
        "output:labels",
        "task:audio-classification",
        "deployment:real-time",
        "deployment:speaker-dependent",
        "evaluation:quantitative"
      ],
      "evidence_count": 3
    },
    {
      "paper_id": "ssi_a-comparison-of-oscillatory-characteristics-in-covert-speech-and-speech-perception",
      "slug": "a-comparison-of-oscillatory-characteristics-in-covert-speech-and-speech-perception",
      "title": "A comparison of oscillatory characteristics in covert speech and speech perception",
      "year": 2020,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Jae Moon",
        "Silvia Orlandi",
        "Tom Chau"
      ],
      "url": "https://nao-ki-mura.com/paper/a-comparison-of-oscillatory-characteristics-in-covert-speech-and-speech-perception",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2009.02816",
      "arxiv_url": "https://arxiv.org/abs/2009.02816",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:brain",
        "modality:eeg",
        "output:labels",
        "evaluation:quantitative"
      ],
      "evidence_count": 3
    },
    {
      "paper_id": "ssi_silent-speech-interfaces-for-speech-restoration-a-review",
      "slug": "silent-speech-interfaces-for-speech-restoration-a-review",
      "title": "Silent Speech Interfaces for Speech Restoration: A Review",
      "year": 2020,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Jose A. Gonzalez-Lopez",
        "Alejandro Gomez-Alanis",
        "Juan M. Martin-Donas",
        "Jose L. Perez-Cordoba",
        "Angel M. Gomez"
      ],
      "url": "https://nao-ki-mura.com/paper/silent-speech-interfaces-for-speech-restoration-a-review",
      "doi": "10.1109/ACCESS.2020.3026579",
      "doi_url": "https://doi.org/10.1109/ACCESS.2020.3026579",
      "arxiv_id": "2009.02110",
      "arxiv_url": "https://arxiv.org/abs/2009.02110",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "modality:multimodal",
        "task:survey"
      ],
      "evidence_count": 4
    },
    {
      "paper_id": "ssi_an-overview-of-deep-learning-based-audio-visual-speech-enhancement-and-separation",
      "slug": "an-overview-of-deep-learning-based-audio-visual-speech-enhancement-and-separation",
      "title": "An Overview of Deep-Learning-Based Audio-Visual Speech Enhancement and Separation",
      "year": 2020,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Daniel Michelsanti",
        "Zheng-Hua Tan",
        "Shi-Xiong Zhang",
        "Yong Xu",
        "Meng Yu",
        "Dong Yu",
        "Jesper Jensen"
      ],
      "url": "https://nao-ki-mura.com/paper/an-overview-of-deep-learning-based-audio-visual-speech-enhancement-and-separation",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2008.09586",
      "arxiv_url": "https://arxiv.org/abs/2008.09586",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:face",
        "body_site:lip",
        "modality:acoustic",
        "modality:microphone",
        "modality:multimodal",
        "modality:video",
        "task:survey",
        "output:speech-audio"
      ],
      "evidence_count": 3
    },
    {
      "paper_id": "ssi_citisen-a-deep-learning-based-speech-signal-processing-mobile-application",
      "slug": "citisen-a-deep-learning-based-speech-signal-processing-mobile-application",
      "title": "CITISEN: A Deep Learning-Based Speech Signal-Processing Mobile Application",
      "year": 2020,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Yu-Wen Chen",
        "Kuo-Hsuan Hung",
        "You-Jin Li",
        "Alexander Chao-Fu Kang",
        "Ya-Hsin Lai",
        "Kai-Chun Liu",
        "Szu-Wei Fu",
        "Syu-Siang Wang",
        "Yu Tsao"
      ],
      "url": "https://nao-ki-mura.com/paper/citisen-a-deep-learning-based-speech-signal-processing-mobile-application",
      "doi": "10.1109/ACCESS.2022.3153469",
      "doi_url": "https://doi.org/10.1109/ACCESS.2022.3153469",
      "arxiv_id": "2008.09264",
      "arxiv_url": "https://arxiv.org/abs/2008.09264",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "modality:acoustic",
        "modality:microphone",
        "task:speech-enhancement",
        "output:speech-audio",
        "deployment:mobile-suitable",
        "evaluation:quantitative"
      ],
      "evidence_count": 3
    },
    {
      "paper_id": "ssi_foley-music-learning-to-generate-music-from-videos",
      "slug": "foley-music-learning-to-generate-music-from-videos",
      "title": "Foley Music: Learning to Generate Music from Videos",
      "year": 2020,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Chuang Gan",
        "Deng Huang",
        "Peihao Chen",
        "Joshua B. Tenenbaum",
        "Antonio Torralba"
      ],
      "url": "https://nao-ki-mura.com/paper/foley-music-learning-to-generate-music-from-videos",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2007.10984",
      "arxiv_url": "https://arxiv.org/abs/2007.10984",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "modality:multimodal",
        "modality:video",
        "output:audio",
        "evaluation:quantitative"
      ],
      "evidence_count": 3
    },
    {
      "paper_id": "ssi_learning-frame-level-attention-for-environmental-sound-classification",
      "slug": "learning-frame-level-attention-for-environmental-sound-classification",
      "title": "Learning Frame Level Attention for Environmental Sound Classification",
      "year": 2020,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Zhichao Zhang",
        "Shugong Xu",
        "Shunqing Zhang",
        "Tianhao Qiao",
        "Shan Cao"
      ],
      "url": "https://nao-ki-mura.com/paper/learning-frame-level-attention-for-environmental-sound-classification",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2007.07241",
      "arxiv_url": "https://arxiv.org/abs/2007.07241",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "modality:acoustic",
        "output:labels",
        "task:audio-classification",
        "evaluation:quantitative",
        "evaluation:structured-benchmark"
      ],
      "evidence_count": 4
    },
    {
      "paper_id": "ssi_ultra2speech-a-deep-learning-framework-for-formant-frequency-estimation-and-tracking-from-ultrasound-tongue-images",
      "slug": "ultra2speech-a-deep-learning-framework-for-formant-frequency-estimation-and-tracking-from-ultrasound-tongue-images",
      "title": "Ultra2Speech -- A Deep Learning Framework for Formant Frequency Estimation and Tracking from Ultrasound Tongue Images",
      "year": 2020,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Pramit Saha",
        "Yadong Liu",
        "Bryan Gick",
        "Sidney Fels"
      ],
      "url": "https://nao-ki-mura.com/paper/ultra2speech-a-deep-learning-framework-for-formant-frequency-estimation-and-tracking-from-ultrasound-tongue-images",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2006.16367",
      "arxiv_url": "https://arxiv.org/abs/2006.16367",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:oral-cavity",
        "body_site:tongue",
        "modality:ultrasound",
        "output:speech-audio",
        "task:speech-reconstruction",
        "evaluation:quantitative"
      ],
      "evidence_count": 4
    },
    {
      "paper_id": "ssi_application-of-just-noticeable-difference-in-quality-as-environment-suitability-test-for-crowdsourcing-speech-quality-assessment-task",
      "slug": "application-of-just-noticeable-difference-in-quality-as-environment-suitability-test-for-crowdsourcing-speech-quality-assessment-task",
      "title": "Application of Just-Noticeable Difference in Quality as Environment Suitability Test for Crowdsourcing Speech Quality Assessment Task",
      "year": 2020,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "[''",
        "'']"
      ],
      "url": "https://nao-ki-mura.com/paper/application-of-just-noticeable-difference-in-quality-as-environment-suitability-test-for-crowdsourcing-speech-quality-assessment-task",
      "doi": "10.1109/QoMEX48832.2020.9123093",
      "doi_url": "https://doi.org/10.1109/QoMEX48832.2020.9123093",
      "arxiv_id": "2004.05502",
      "arxiv_url": "https://arxiv.org/abs/2004.05502",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "output:labels",
        "evaluation:quantitative"
      ],
      "evidence_count": 3
    },
    {
      "paper_id": "ssi_vocoder-based-speech-synthesis-from-silent-videos",
      "slug": "vocoder-based-speech-synthesis-from-silent-videos",
      "title": "Vocoder-Based Speech Synthesis from Silent Videos",
      "year": 2020,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Daniel Michelsanti",
        "Olga Slizovskaia",
        "Gloria Haro",
        "Emilia Gomez",
        "Zheng-Hua Tan",
        "Jesper Jensen"
      ],
      "url": "https://nao-ki-mura.com/paper/vocoder-based-speech-synthesis-from-silent-videos",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2004.02541",
      "arxiv_url": "https://arxiv.org/abs/2004.02541",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:lip",
        "deployment:hands-free",
        "deployment:real-time",
        "deployment:speaker-independent",
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "modality:video",
        "output:speech-audio",
        "task:speech-reconstruction"
      ],
      "evidence_count": 10
    },
    {
      "paper_id": "ssi_continuous-silent-speech-recognition-using-eeg",
      "slug": "continuous-silent-speech-recognition-using-eeg",
      "title": "Continuous Silent Speech Recognition using EEG",
      "year": 2020,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Gautam Krishna",
        "Co Tran",
        "Mason Carnahan",
        "Ahmed H Tewfik"
      ],
      "url": "https://nao-ki-mura.com/paper/continuous-silent-speech-recognition-using-eeg",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2002.03851",
      "arxiv_url": "https://arxiv.org/abs/2002.03851",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:brain",
        "deployment:hands-free",
        "deployment:speaker-dependent",
        "evaluation:quantitative",
        "modality:eeg",
        "output:text",
        "task:speech-recognition"
      ],
      "evidence_count": 8
    },
    {
      "paper_id": "ssi_brain2char-a-deep-architecture-for-decoding-text-from-brain-recordings",
      "slug": "brain2char-a-deep-architecture-for-decoding-text-from-brain-recordings",
      "title": "Brain2Char: A Deep Architecture for Decoding Text from Brain Recordings",
      "year": 2019,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Pengfei Sun",
        "Gopala K. Anumanchipalli",
        "Edward F. Chang"
      ],
      "url": "https://nao-ki-mura.com/paper/brain2char-a-deep-architecture-for-decoding-text-from-brain-recordings",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "1909.01401",
      "arxiv_url": "https://arxiv.org/abs/1909.01401",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium-high",
      "review_confidence": "medium-high",
      "review_basis": "full_text+structured_benchmark+summary",
      "source_coverage": "high",
      "tags": [
        "body_site:brain",
        "deployment:hands-free",
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "output:text",
        "task:speech-recognition"
      ],
      "evidence_count": 6
    },
    {
      "paper_id": "ssi_demucs-deep-extractor-for-music-sources-with-extra-unlabeled-data-remixed",
      "slug": "demucs-deep-extractor-for-music-sources-with-extra-unlabeled-data-remixed",
      "title": "Demucs: Deep Extractor for Music Sources with extra unlabeled data remixed",
      "year": 2019,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Alexandr Defossez",
        "Nicola Usunier",
        "Léon Bottou",
        "Francis Bach"
      ],
      "url": "https://nao-ki-mura.com/paper/demucs-deep-extractor-for-music-sources-with-extra-unlabeled-data-remixed",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "1909.01174",
      "arxiv_url": "https://arxiv.org/abs/1909.01174",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "medium",
      "review_basis": "full_text+structured_benchmark+summary",
      "source_coverage": "high",
      "tags": [
        "modality:acoustic",
        "evaluation:structured-benchmark"
      ],
      "evidence_count": 7
    },
    {
      "paper_id": "ssi_attention-based-convolutional-recurrent-neural-network-for-environmental-sound-classification",
      "slug": "attention-based-convolutional-recurrent-neural-network-for-environmental-sound-classification",
      "title": "Attention based Convolutional Recurrent Neural Network for Environmental Sound Classification",
      "year": 2019,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Zhichao Zhang",
        "Shugong Xu",
        "Tianhao Qiao",
        "Shunq Zhang"
      ],
      "url": "https://nao-ki-mura.com/paper/attention-based-convolutional-recurrent-neural-network-for-environmental-sound-classification",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "1907.02230",
      "arxiv_url": "https://arxiv.org/abs/1907.02230",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "medium",
      "review_basis": "full_text+structured_benchmark+summary",
      "source_coverage": "high",
      "tags": [
        "modality:acoustic",
        "evaluation:structured-benchmark",
        "output:labels",
        "task:audio-classification"
      ],
      "evidence_count": 5
    },
    {
      "paper_id": "ssi_lipper-synthesizing-thy-speech-using-multi-view-lipreading",
      "slug": "lipper-synthesizing-thy-speech-using-multi-view-lipreading",
      "title": "Lipper: Synthesizing Thy Speech using Multi-View Lipreading",
      "year": 2019,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Yaman Kumar",
        "Rohit Jain",
        "Khwaja Mohd. Salik",
        "Rajiv Ratn Shah",
        "Yifang Yin",
        "Roger Zimmermann"
      ],
      "url": "https://nao-ki-mura.com/paper/lipper-synthesizing-thy-speech-using-multi-view-lipreading",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "1907.01367",
      "arxiv_url": "https://arxiv.org/abs/1907.01367",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:face",
        "body_site:lip",
        "modality:video",
        "output:speech-audio",
        "task:speech-reconstruction",
        "evaluation:quantitative",
        "deployment:real-time"
      ],
      "evidence_count": 4
    },
    {
      "paper_id": "ssi_ultrasound-based-silent-speech-interface-built-on-a-continuous-vocoder",
      "slug": "ultrasound-based-silent-speech-interface-built-on-a-continuous-vocoder",
      "title": "Ultrasound-based Silent Speech Interface Built on a Continuous Vocoder",
      "year": 2019,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Moham Salah Alradhi",
        "Géza Németh",
        "Gábor Gosztolya",
        "Tamás Gábor Csapó",
        "László Tóth",
        "Alexandra Markó"
      ],
      "url": "https://nao-ki-mura.com/paper/ultrasound-based-silent-speech-interface-built-on-a-continuous-vocoder",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "1906.09885",
      "arxiv_url": "https://arxiv.org/abs/1906.09885",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium-high",
      "review_confidence": "medium-high",
      "review_basis": "full_text+structured_benchmark+summary",
      "source_coverage": "high",
      "tags": [
        "body_site:tongue",
        "modality:ultrasound",
        "output:speech-audio",
        "evaluation:structured-benchmark",
        "task:speech-reconstruction"
      ],
      "evidence_count": 7
    },
    {
      "paper_id": "ssi_video-driven-speech-reconstruction-using-generative-adversarial-networks",
      "slug": "video-driven-speech-reconstruction-using-generative-adversarial-networks",
      "title": "Video-Driven Speech Reconstruction using Generative Adversarial Networks",
      "year": 2019,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Konstantinos Vougioukas",
        "Pingchuan Ma",
        "Stavros Petridis",
        "Maja Pantic"
      ],
      "url": "https://nao-ki-mura.com/paper/video-driven-speech-reconstruction-using-generative-adversarial-networks",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "1906.06301",
      "arxiv_url": "https://arxiv.org/abs/1906.06301",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:face",
        "body_site:lip",
        "modality:video",
        "output:speech-audio",
        "task:speech-reconstruction",
        "evaluation:quantitative",
        "evaluation:structured-benchmark"
      ],
      "evidence_count": 4
    },
    {
      "paper_id": "ssi_a-novel-task-oriented-text-corpus-in-silent-speech-recognition-and-its-natural-language-generation-construction-method",
      "slug": "a-novel-task-oriented-text-corpus-in-silent-speech-recognition-and-its-natural-language-generation-construction-method",
      "title": "A Novel Task-Oriented Text Corpus in Silent Speech Recognition and its Natural Language Generation Construction Method",
      "year": 2019,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Dong Cao",
        "Dongdong Zhang",
        "HaiBo Chen"
      ],
      "url": "https://nao-ki-mura.com/paper/a-novel-task-oriented-text-corpus-in-silent-speech-recognition-and-its-natural-language-generation-construction-method",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "1905.01974",
      "arxiv_url": "https://arxiv.org/abs/1905.01974",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "medium",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:brain",
        "modality:eeg",
        "output:text",
        "task:dataset"
      ],
      "evidence_count": 4
    },
    {
      "paper_id": "ssi_autoencoder-based-articulatory-to-acoustic-mapping-for-ultrasound-silent-speech-interfaces",
      "slug": "autoencoder-based-articulatory-to-acoustic-mapping-for-ultrasound-silent-speech-interfaces",
      "title": "Autoencoder-Based Articulatory-to-Acoustic Mapping for Ultrasound Silent Speech Interfaces",
      "year": 2019,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Gábor Gosztolya",
        "MTA-SZTE Research Group on Artificial Intelligence",
        "University of Szeged",
        "Hungary",
        "Gábor Infusz",
        "ginfusz@inf.u-szeged.hu",
        "Ádám Pintér",
        "Institute of Informatics",
        "University of Szeged",
        "Szeged",
        "Hungary",
        "László Tóth",
        "Institute of Informatics",
        "University of Szeged",
        "Szeged",
        "Hungary",
        "Alexandra Markó",
        "Department of Phonetics",
        "Eötvös Loránd University",
        "MTA-ELTE Lendület Lingual Articulation Research Group",
        "Budapest",
        "Hungary",
        "Gábor Csapó",
        "Department of Telecommunications and Media Informatics",
        "Budapest University of Technology and Economics",
        "MTA-ELTE Lendület Lingual Articulation Research Group",
        "Budapest",
        "Hungary"
      ],
      "url": "https://nao-ki-mura.com/paper/autoencoder-based-articulatory-to-acoustic-mapping-for-ultrasound-silent-speech-interfaces",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "1904.05259",
      "arxiv_url": "https://arxiv.org/abs/1904.05259",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium-high",
      "review_confidence": "medium-high",
      "review_basis": "full_text+structured_benchmark+summary",
      "source_coverage": "high",
      "tags": [
        "body_site:tongue",
        "modality:ultrasound",
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "output:speech-audio",
        "task:speech-reconstruction"
      ],
      "evidence_count": 4
    },
    {
      "paper_id": "ssi_denoising-convolutional-autoencoder-based-b-mode-ultrasound-tongue-image-feature-extraction",
      "slug": "denoising-convolutional-autoencoder-based-b-mode-ultrasound-tongue-image-feature-extraction",
      "title": "Denoising convolutional autoencoder based B-mode ultrasound tongue image feature extraction",
      "year": 2019,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Bo Li",
        "Kele Xu",
        "Dawei Feng",
        "Haibo Mi",
        "Huaimin Wang",
        "Jian Zhu"
      ],
      "url": "https://nao-ki-mura.com/paper/denoising-convolutional-autoencoder-based-b-mode-ultrasound-tongue-image-feature-extraction",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "1903.00888",
      "arxiv_url": "https://arxiv.org/abs/1903.00888",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium-high",
      "review_confidence": "medium-high",
      "review_basis": "full_text+structured_benchmark+summary",
      "source_coverage": "high",
      "tags": [
        "body_site:tongue",
        "deployment:hands-free",
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "evaluation:walking-tested",
        "modality:ultrasound",
        "output:text",
        "task:speech-reconstruction"
      ],
      "evidence_count": 8
    },
    {
      "paper_id": "ssi_all-neural-online-source-separation-counting-and-diarization-for-meeting-analysis",
      "slug": "all-neural-online-source-separation-counting-and-diarization-for-meeting-analysis",
      "title": "All-neural online source separation, counting, and diarization for meeting analysis",
      "year": 2019,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Thilo von Neumann",
        "Keisuke Kinoshita",
        "Marc Delcroix",
        "Shoko Araki",
        "Tomohiro Nakatani",
        "Reinhold Haeb-Umbach"
      ],
      "url": "https://nao-ki-mura.com/paper/all-neural-online-source-separation-counting-and-diarization-for-meeting-analysis",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "1902.07881",
      "arxiv_url": "https://arxiv.org/abs/1902.07881",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "modality:acoustic",
        "output:speech-audio"
      ],
      "evidence_count": 4
    },
    {
      "paper_id": "kimura2019_sottovoce",
      "slug": "sottovoce",
      "title": "SottoVoce: An Ultrasound Imaging-Based Silent Speech Interaction Using Deep Neural Networks",
      "year": 2019,
      "venue": "CHI '19",
      "authors": [
        "Naoki Kimura",
        "Michinari Kono",
        "Jun Rekimoto"
      ],
      "url": "https://nao-ki-mura.com/paper/sottovoce",
      "doi": "10.1145/3290605.3300376",
      "doi_url": "https://doi.org/10.1145/3290605.3300376",
      "arxiv_id": "",
      "arxiv_url": "",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text+existing_expert_seed",
      "source_coverage": "high",
      "tags": [
        "body_site:jaw",
        "body_site:oral-cavity",
        "deployment:hands-free",
        "deployment:speaker-dependent",
        "evaluation:quantitative",
        "modality:ultrasound",
        "output:speech-audio",
        "task:speech-reconstruction"
      ],
      "evidence_count": 7
    },
    {
      "paper_id": "ssi_audio-spectrogram-factorization-for-classification-of-telephony-signals-below-the-auditory-threshold",
      "slug": "audio-spectrogram-factorization-for-classification-of-telephony-signals-below-the-auditory-threshold",
      "title": "Audio Spectrogram Factorization for Classification of Telephony Signals below the Auditory Threshold",
      "year": 2018,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Iroro Orife",
        "Shane Walker",
        "Jason Flaks"
      ],
      "url": "https://nao-ki-mura.com/paper/audio-spectrogram-factorization-for-classification-of-telephony-signals-below-the-auditory-threshold",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "1811.04139",
      "arxiv_url": "https://arxiv.org/abs/1811.04139",
      "review_state": "expert_fulltext_draft",
      "review_priority": "low",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "deployment:real-time",
        "evaluation:quantitative",
        "modality:acoustic",
        "output:labels",
        "task:audio-classification"
      ],
      "evidence_count": 4
    },
    {
      "paper_id": "ssi_proactive-security-embedded-ai-solution-for-violent-and-abusive-speech-recognition",
      "slug": "proactive-security-embedded-ai-solution-for-violent-and-abusive-speech-recognition",
      "title": "Proactive Security: Embedded AI Solution for Violent and Abusive Speech Recognition",
      "year": 2018,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Christoph Dane",
        "Shulbi Bixbi Solutions",
        "Leonardo Pombal",
        "Samsung Research",
        "Vitor Jord˜ao",
        "Shulbi Bixbi Solutions",
        "Guilherme Zioll",
        "Samsung Research",
        "Bruno Martho",
        "Shulbi Bixbi Solutions",
        "Antˆonio Postal",
        "Samsung Research",
        "Thiago Prochnow",
        "Samsung Research"
      ],
      "url": "https://nao-ki-mura.com/paper/proactive-security-embedded-ai-solution-for-violent-and-abusive-speech-recognition",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "1810.09431",
      "arxiv_url": "https://arxiv.org/abs/1810.09431",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "medium",
      "review_basis": "full_text+structured_benchmark+summary",
      "source_coverage": "high",
      "tags": [
        "modality:microphone",
        "evaluation:structured-benchmark",
        "output:text",
        "task:speech-recognition"
      ],
      "evidence_count": 8
    },
    {
      "paper_id": "ssi_harnessing-ai-for-speech-reconstruction-using-multi-view-silent-video-feed",
      "slug": "harnessing-ai-for-speech-reconstruction-using-multi-view-silent-video-feed",
      "title": "Harnessing AI for Speech Reconstruction using Multi-view Silent Video Feed",
      "year": 2018,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Yaman Kumar",
        "Mayank Aggarwal",
        "Pratham Nawal",
        "Shin’ichi Satoh",
        "Rajiv Ratn Shah",
        "Roger Zimmermann"
      ],
      "url": "https://nao-ki-mura.com/paper/harnessing-ai-for-speech-reconstruction-using-multi-view-silent-video-feed",
      "doi": "10.1145/3240508.3241911",
      "doi_url": "https://doi.org/10.1145/3240508.3241911",
      "arxiv_id": "1807.00619",
      "arxiv_url": "https://arxiv.org/abs/1807.00619",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium-high",
      "review_confidence": "medium-high",
      "review_basis": "full_text+structured_benchmark+summary",
      "source_coverage": "high",
      "tags": [
        "body_site:face",
        "body_site:lip",
        "body_site:tongue",
        "modality:camera",
        "modality:video",
        "output:speech-audio",
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "task:speech-reconstruction"
      ],
      "evidence_count": 6
    },
    {
      "paper_id": "ssi_visual-only-recognition-of-normal-whispered-and-silent-speech",
      "slug": "visual-only-recognition-of-normal-whispered-and-silent-speech",
      "title": "Visual-Only Recognition of Normal, Whispered and Silent Speech",
      "year": 2018,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Stavro Petridis",
        "Jie Shen",
        "Doruk Cetin",
        "Maja Pantic"
      ],
      "url": "https://nao-ki-mura.com/paper/visual-only-recognition-of-normal-whispered-and-silent-speech",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "1802.06399",
      "arxiv_url": "https://arxiv.org/abs/1802.06399",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:face",
        "body_site:lip",
        "deployment:speaker-independent",
        "evaluation:quantitative",
        "modality:video",
        "output:text",
        "task:speech-recognition"
      ],
      "evidence_count": 4
    },
    {
      "paper_id": "ssi_cross-modal-embeddings-for-video-and-audio-retrieval",
      "slug": "cross-modal-embeddings-for-video-and-audio-retrieval",
      "title": "Cross-modal Embeddings for Video and Audio Retrieval",
      "year": 2018,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Didac Suris",
        "Amanda Duarte",
        "Amaia Salvador",
        "Jordi Torres",
        "Xavier Giro-i-Nieto"
      ],
      "url": "https://nao-ki-mura.com/paper/cross-modal-embeddings-for-video-and-audio-retrieval",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "1801.02200",
      "arxiv_url": "https://arxiv.org/abs/1801.02200",
      "review_state": "expert_fulltext_draft",
      "review_priority": "low",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "evaluation:quantitative",
        "modality:acoustic",
        "modality:multimodal",
        "modality:video"
      ],
      "evidence_count": 4
    },
    {
      "paper_id": "ssi_lip2audspec-speech-reconstruction-from-silent-lip-movements-video",
      "slug": "lip2audspec-speech-reconstruction-from-silent-lip-movements-video",
      "title": "Lip2AudSpec: Speech reconstruction from silent lip movements video",
      "year": 2017,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Hassan Akbari",
        "Himani Arora",
        "Liangliang Cao",
        "Nima Mesgarani"
      ],
      "url": "https://nao-ki-mura.com/paper/lip2audspec-speech-reconstruction-from-silent-lip-movements-video",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "1710.09798",
      "arxiv_url": "https://arxiv.org/abs/1710.09798",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:lip",
        "deployment:hands-free",
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "modality:video",
        "output:speech-audio",
        "task:speech-reconstruction"
      ],
      "evidence_count": 6
    },
    {
      "paper_id": "ssi_updating-the-silent-speech-challenge-benchmark-with-deep-learning",
      "slug": "updating-the-silent-speech-challenge-benchmark-with-deep-learning",
      "title": "Updating the silent speech challenge benchmark with deep learning",
      "year": 2017,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Yan Ji",
        "Licheng Liu",
        "Hongcui Wang",
        "Zhilei Liu",
        "Zhibin Niu",
        "Bruce Denby"
      ],
      "url": "https://nao-ki-mura.com/paper/updating-the-silent-speech-challenge-benchmark-with-deep-learning",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "1709.06818",
      "arxiv_url": "https://arxiv.org/abs/1709.06818",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:lip",
        "body_site:tongue",
        "deployment:speaker-dependent",
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "modality:ultrasound",
        "modality:video",
        "output:text",
        "task:speech-recognition"
      ],
      "evidence_count": 4
    },
    {
      "paper_id": "ssi_seeing-through-noise-visually-driven-speaker-separation-and-enhancement",
      "slug": "seeing-through-noise-visually-driven-speaker-separation-and-enhancement",
      "title": "Seeing Through Noise: Visually Driven Speaker Separation and Enhancement",
      "year": 2017,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Aviv Gabbay",
        "Ariel Ephrat",
        "Tavi Halperin",
        "Shmuel Peleg"
      ],
      "url": "https://nao-ki-mura.com/paper/seeing-through-noise-visually-driven-speaker-separation-and-enhancement",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "1708.06767",
      "arxiv_url": "https://arxiv.org/abs/1708.06767",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:face",
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "modality:acoustic",
        "modality:video",
        "output:speech-audio",
        "task:speech-enhancement"
      ],
      "evidence_count": 6
    },
    {
      "paper_id": "ssi_improved-speech-reconstruction-from-silent-video",
      "slug": "improved-speech-reconstruction-from-silent-video",
      "title": "Improved Speech Reconstruction from Silent Video",
      "year": 2017,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Ariel Ephrat",
        "Tavi Halperin",
        "Shmuel Peleg"
      ],
      "url": "https://nao-ki-mura.com/paper/improved-speech-reconstruction-from-silent-video",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "1708.01204",
      "arxiv_url": "https://arxiv.org/abs/1708.01204",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:face",
        "deployment:hands-free",
        "deployment:speaker-dependent",
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "evaluation:unseen-words",
        "modality:video",
        "output:speech-audio",
        "task:speech-reconstruction"
      ],
      "evidence_count": 10
    },
    {
      "paper_id": "ssi_vid2speech-speech-reconstruction-from-silent-video",
      "slug": "vid2speech-speech-reconstruction-from-silent-video",
      "title": "Vid2speech: Speech Reconstruction from Silent Video",
      "year": 2017,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Ariel Ephrat",
        "Shmuel Peleg"
      ],
      "url": "https://nao-ki-mura.com/paper/vid2speech-speech-reconstruction-from-silent-video",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "1701.00495",
      "arxiv_url": "https://arxiv.org/abs/1701.00495",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:face",
        "body_site:lip",
        "deployment:speaker-dependent",
        "evaluation:quantitative",
        "evaluation:unseen-words",
        "modality:video",
        "output:speech-audio",
        "task:speech-reconstruction"
      ],
      "evidence_count": 4
    },
    {
      "paper_id": "ssi_contour-based-3d-tongue-motion-visualization-using-ultrasound-image-sequences",
      "slug": "contour-based-3d-tongue-motion-visualization-using-ultrasound-image-sequences",
      "title": "Contour-based 3d tongue motion visualization using ultrasound image sequences",
      "year": 2016,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Kele Xu",
        "Yin Yang",
        "Clemence Leboullenger",
        "Pierre Roussel",
        "Bruce Denby"
      ],
      "url": "https://nao-ki-mura.com/paper/contour-based-3d-tongue-motion-visualization-using-ultrasound-image-sequences",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "1605.05967",
      "arxiv_url": "https://arxiv.org/abs/1605.05967",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:tongue",
        "evaluation:quantitative",
        "modality:ultrasound"
      ],
      "evidence_count": 4
    },
    {
      "paper_id": "ssi_optimal-power-control-for-analog-bidirectional-relaying-with-long-term-relay-power-constraint",
      "slug": "optimal-power-control-for-analog-bidirectional-relaying-with-long-term-relay-power-constraint",
      "title": "Optimal Power Control for Analog Bidirectional Relaying with Long-Term Relay Power Constraint",
      "year": 2014,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Zoran Hadzi-Velkov",
        "Nikola Zlatanov",
        "Robert Schober"
      ],
      "url": "https://nao-ki-mura.com/paper/optimal-power-control-for-analog-bidirectional-relaying-with-long-term-relay-power-constraint",
      "doi": "10.1109/GLOCOM.2013.6831710",
      "doi_url": "https://doi.org/10.1109/GLOCOM.2013.6831710",
      "arxiv_id": "1404.0906",
      "arxiv_url": "https://arxiv.org/abs/1404.0906",
      "review_state": "expert_fulltext_draft",
      "review_priority": "low",
      "review_confidence": "medium-high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [],
      "evidence_count": 4
    }
  ],
  "reviews": [
    {
      "paper_id": "arxiv_2606-09667",
      "slug": "cross-modal-masking-for-robust-silent-speech-synthesis-using-semg-and-lipreading",
      "title": "Cross-Modal Masking for Robust Silent Speech Synthesis Using sEMG and Lipreading",
      "year": 2026,
      "venue": "arXiv",
      "authors": [
        "Eder del Blanco",
        "David Gimeno-Gómez",
        "Eva Navas",
        "Carlos-D. Martínez-Hinarejos",
        "Inma Hernáez"
      ],
      "url": "https://nao-ki-mura.com/paper/cross-modal-masking-for-robust-silent-speech-synthesis-using-semg-and-lipreading",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2606.09667",
      "arxiv_url": "https://arxiv.org/abs/2606.09667",
      "review_state": "expert_fulltext_draft",
      "review_priority": "High",
      "review_confidence": "High confidence based on extensive full text analysis",
      "review_basis": "full_text+summary",
      "source_coverage": "high",
      "tags": [
        "body_site:lip",
        "body_site:skin",
        "evaluation:quantitative",
        "modality:emg",
        "modality:video",
        "output:speech-audio",
        "output:text",
        "task:speech-recognition",
        "task:speech-reconstruction"
      ],
      "expert_take_short": "The paper advances silent speech synthesis by leveraging masked training to robustly fuse electromyography and lipreading, showing improved performance and resilience, but adaptation to laryngectomized users remains challenging.",
      "expert_take_long": "This paper makes a significant contribution to silent speech interface research by proposing a masked multimodal speech synthesis system that robustly integrates surface electromyography and lipreading signals. The approach addresses prior gaps by improving performance in multi-speaker silent speech synthesis and introducing temporal adaptive masking during training to enhance robustness against sensor noise and modality degradation. The authors provide extensive experimental validation on a Spanish dataset including laryngectomized and laryngeal speakers, showing substantial reductions in word error rates and complementary modality contributions at the phonemic level. However, the study reveals challenges in adapting to post-laryngectomy speakers due to variability and speech production differences, highlighting an important future direction. Overall, this work advances the SSI field by demonstrating the benefits and necessity of robust multimodal fusion strategies and sets a foundation for practical silent speech restoration systems, though further progress is needed for clinical deployment and generalization.",
      "expert_true_value": "Confirms that masked multimodal integration of sEMG and lipreading improves silent speech synthesis robustness and accuracy, while emphasizing that clinical adaptation requires addressing speaker variability and articulation differences.",
      "canon_before": "Prior work explored unimodal silent speech interfaces using either sEMG or lipreading; multimodal combinations of sEMG and lipreading were limited to classification tasks and mostly in audible speech conditions; masking strategies used mainly in audio-visual speech recognition for enhancing robustness but not extensively for silent speech synthesis.",
      "delta_from_canon": "First use of masked multimodal training combining sEMG and lipreading for continuous silent speech synthesis with detailed evaluation showing improved WER and phoneme accuracy and robustness under modality degradation and sensor failure simulations.",
      "position_in_field": "Intermediate-advanced position; builds meaningfully on prior SSI methods by combining modalities under masked training and demonstrates robustness enhancements, but clinical deployment challenges remain.",
      "practical_value": "Improves silent speech synthesis robustness; offers insights for practical SSI designs; yet needs further clinical validation and deployment advances.",
      "axes_moved": "Robustness to modality degradation improved; multimodal integration effectiveness advanced; phone-level articulatory class-level understanding enhanced.",
      "axes_unresolved": "Generalization to unseen speakers beyond the dataset, full speaker-independent operation, efficacy for other languages remain unexplored.",
      "axes_regressed": "Adaptation to laryngectomized speakers shows performance challenges; unimodal masking degrades performance especially for sEMG.",
      "technical_limits": "Single speaker generalization and adaptation to alaryngeal speakers limited; real-time inference not demonstrated; masking only validated in specific degradation regimes.",
      "evaluation_limits": "Evaluation focuses on multi-speaker but limited vocabulary Spanish dataset; unseen words testing is limited; limited adaptation success for laryngectomized users; generalization to other languages or spontaneous speech not assessed.",
      "deployment_limits": "Current study does not evaluate real-time deployment, mobile suitability, or wearable system practicalities; robustness to speaker variability especially post-laryngectomy remains a challenge.",
      "scope_limits": "Limited to Spanish language sentence data; limited number of laryngectomized subjects; no spontaneous speech or large vocabulary continuous speech tested.",
      "task": "speech-reconstruction (silent speech synthesis)",
      "input_modality": "multimodal (sEMG and video lipreading)",
      "sensor_hardware": "Surface electromyography 8 bipolar sensors placed on face and neck; video from RGB camera of speaker's lips",
      "body_site": "face; lip",
      "output_type": "speech-audio (mel spectrograms) and phonetic labels",
      "vocabulary_type": "Phoneme-level labels, sentence-level utterances with limited vocabulary size not quantified extensively",
      "vocabulary_size": "Limited to about 30 phoneme classes plus silence token; test sentences phonemically balanced",
      "metrics": "Phone Accuracy and Word Error Rate (WER) from Whisper v3, Structural Similarity Index (SSIM) for spectrogram quality; exact metrics provided with confidence intervals.",
      "evaluation_mode": "Quantitative evaluation with monomodal baselines and multimodal models under controlled experimental conditions including masking ablations and bitrate/video degradation simulations.",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 1,
          "statement": "Main contributions: 1) Demonstrate sEMG and lipreading are complementary, fusion improves WER and phone accuracy; 2) critical role of temporal adaptive masking for robustness and generalization; 3) phone-level multimodal contributions with benefits for vowels and affricates.",
          "evidence_source": "inputs/ssi_fulltext/text/arxiv_2606-09667-cross-modal-masking-for-robust-silent-speech-synthesis-using-semg-and-lipreading.txt",
          "source_ref": "inputs/ssi_fulltext/text/arxiv_2606-09667-cross-modal-masking-for-robust-silent-speech-synthesis-using-semg-and-lipreading.txt",
          "section_or_location": "Abstract, Introduction",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 1,
          "statement": "Proposes masked multimodal speech synthesis framework integrating sEMG and lipreading with modality masking during training to improve robustness and performance under modality degradation and sensor failure conditions in continuous speech synthesis.",
          "evidence_source": "inputs/ssi_fulltext/text/arxiv_2606-09667-cross-modal-masking-for-robust-silent-speech-synthesis-using-semg-and-lipreading.txt",
          "source_ref": "inputs/ssi_fulltext/text/arxiv_2606-09667-cross-modal-masking-for-robust-silent-speech-synthesis-using-semg-and-lipreading.txt",
          "section_or_location": "Introduction, Methods",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 1,
          "statement": "Evaluation on multi-speaker Spanish ReSSInt dataset including audible and silent speech, with 8-channel sEMG and lip video, controlled studio conditions, with data splits ensuring text independence. Includes laryngeal and laryngectomized subjects for silent speech synthesis evaluation.",
          "evidence_source": "inputs/ssi_fulltext/text/arxiv_2606-09667-cross-modal-masking-for-robust-silent-speech-synthesis-using-semg-and-lipreading.txt",
          "source_ref": "inputs/ssi_fulltext/text/arxiv_2606-09667-cross-modal-masking-for-robust-silent-speech-synthesis-using-semg-and-lipreading.txt",
          "section_or_location": "Section IV",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "Performance reported via Word Error Rate (WER), Phone Accuracy, Structural Similarity Index Measure (SSIM) for spectral reconstruction; WER improvements up to 14 absolute points over strongest unimodal baseline.",
          "evidence_source": "inputs/ssi_fulltext/text/arxiv_2606-09667-cross-modal-masking-for-robust-silent-speech-synthesis-using-semg-and-lipreading.txt",
          "source_ref": "inputs/ssi_fulltext/text/arxiv_2606-09667-cross-modal-masking-for-robust-silent-speech-synthesis-using-semg-and-lipreading.txt",
          "section_or_location": "Section V",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 1,
          "statement": "Adaptation to laryngectomized speakers remains an open challenge due to articulatory variability and lack of paired audible speech; multimodal fusion less beneficial for such speakers.",
          "evidence_source": "inputs/ssi_fulltext/text/arxiv_2606-09667-cross-modal-masking-for-robust-silent-speech-synthesis-using-semg-and-lipreading.txt",
          "source_ref": "inputs/ssi_fulltext/text/arxiv_2606-09667-cross-modal-masking-for-robust-silent-speech-synthesis-using-semg-and-lipreading.txt",
          "section_or_location": "Section V",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "deployment_claim",
          "confidence": 0.8,
          "statement": "Robust multimodal fusion with masking promotes resilience to sensor noise, missing or degraded modality data, advancing towards real-world deployments, but real-time and mobile suitability not demonstrated yet.",
          "evidence_source": "inputs/ssi_fulltext/text/arxiv_2606-09667-cross-modal-masking-for-robust-silent-speech-synthesis-using-semg-and-lipreading.txt",
          "source_ref": "inputs/ssi_fulltext/text/arxiv_2606-09667-cross-modal-masking-for-robust-silent-speech-synthesis-using-semg-and-lipreading.txt",
          "section_or_location": "Abstract, Conclusion",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "arxiv_2606-01264",
      "slug": "a-1000-hour-eeg-emg-audio-dataset-of-japanese-speech-production",
      "title": "A 1000-hour EEG-EMG-audio dataset of Japanese speech production",
      "year": 2026,
      "venue": "arXiv",
      "authors": [
        "Motoshige Sato",
        "Ilya Horiguchi",
        "Masakazu Inoue",
        "Kenichi Tomeoka",
        "Eri Hatakeyama",
        "Yuya Kita",
        "Atsushi Yamamoto",
        "Ippei Fujisawa",
        "Shuntaro Sasai"
      ],
      "url": "https://nao-ki-mura.com/paper/a-1000-hour-eeg-emg-audio-dataset-of-japanese-speech-production",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2606.01264",
      "arxiv_url": "https://arxiv.org/abs/2606.01264",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text+summary",
      "source_coverage": "high",
      "tags": [
        "body_site:brain",
        "body_site:skin",
        "evaluation:quantitative",
        "modality:eeg",
        "modality:emg"
      ],
      "expert_take_short": "A 1020-hour multimodal EEG-EMG-audio dataset for Japanese overt speech vastly expands data resources, enabling diverse speech decoding and EEG research, though generalization is limited by three participants and no decoding benchmarks are presented.",
      "expert_take_long": "This paper releases the JapanEEG dataset, a large-scale multimodal collection of 1020 hours of scalp EEG, facial EMG, and audio recorded during open-vocabulary overt Japanese speech from three participants using three different EEG systems. The dataset substantially exceeds prior public speech EEG datasets in scale and diversity of recording hardware. Rigorous technical validation demonstrates expected physiological EEG spectral profiles and event-related potentials across devices and participants, confirming high data quality. The work's principal contribution is the dataset itself, positioned to enable diverse speech decoding studies and to support broader EEG research on artifact modeling, representation learning, and cross-session/device adaptation. Limitations include a small participant pool limiting generalization and the lack of downstream decoding benchmarks. Overall, this dataset fills a notable gap in publicly available speech EEG resources, particularly for overt speech in Japanese, and offers valuable opportunities for the SSI community to develop and evaluate decoding approaches with large longitudinal multimodal EEG data.",
      "expert_true_value": "A large multimodal dataset of high-quality EEG, EMG, and audio for overt Japanese speech with strong technical validation, advancing data availability for non-invasive speech decoding research.",
      "canon_before": "Prior public EEG datasets for speech decoding were smaller in scale (hours from single- or few-channel systems), often limited to imagined speech or single devices, and predominantly in languages other than Japanese.",
      "delta_from_canon": "Significantly larger scale and multimodal dataset with cross-device, longitudinal recordings in Japanese overt speech, supporting broader research areas beyond speech decoding alone.",
      "position_in_field": "significant dataset contribution expanding scale and modality diversity for speech EEG decoding in Japanese overt speech",
      "practical_value": "High practical value for research in EEG-based speech decoding, artifact modeling, and cross-device studies; direct application to deployed systems not presented.",
      "axes_moved": "scale of available EEG speech datasets dramatically increased with simultaneous multimodal recording and multiple EEG systems across many sessions.",
      "axes_unresolved": "Generalization to larger subject populations is unresolved due to only three participants.",
      "axes_regressed": "No regression in the axes of modality or evaluation scope was identified from the text.",
      "technical_limits": "Small sample size (n=3) limits generalization; lack of downstream decoding evaluations; longitudinal recordings from few participants only; no testing of wearable or mobile suitability reported.",
      "evaluation_limits": "Evaluation is limited to basic signal quality and physiological validation analyses, not downstream decoding performance or benchmarking.",
      "deployment_limits": "The dataset itself is not a deployed system; no deployment readiness or specific application deployment is described.",
      "scope_limits": "Data collected only from three Japanese male participants limits participant diversity; evaluation limited to signal quality checks without decoding experiments.",
      "task": "dataset",
      "input_modality": "multimodal",
      "sensor_hardware": "g.Pangolin (128 ch), g.SCARABEO (62 ch), eego™sports (63 ch); scalp EEG caps with up to 128 channels plus facial EMG electrodes (3 channels) configured around lips and eyes; lavalier microphone for audio recording.",
      "body_site": "brain",
      "output_type": "audio",
      "vocabulary_type": "open vocabulary",
      "vocabulary_size": "open vocabulary",
      "metrics": "",
      "evaluation_mode": "quantitative",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 1,
          "statement": "We present a multimodal dataset of 1020 hours of simultaneously recorded scalp electroencephalography (EEG), facial electromyography (EMG), and speech audio from three healthy native Japanese speakers during open-vocabulary overt speech.",
          "evidence_source": "inputs/ssi_fulltext/text/arxiv_2606-01264-a-1000-hour-eeg-emg-audio-dataset-of-japanese-speech-production.txt",
          "source_ref": "inputs/ssi_fulltext/text/arxiv_2606-01264-a-1000-hour-eeg-emg-audio-dataset-of-japanese-speech-production.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "fact",
          "confidence": 1,
          "statement": "Three scalp EEG systems were employed across the dataset: g.Pangolin (128 channels), g.SCARABEO (62 channels), and eego™sports (63 channels). Facial EMG was recorded simultaneously with three bipolar channels placed on the upper lip, lower lip, and eye regions.",
          "evidence_source": "inputs/ssi_fulltext/text/arxiv_2606-01264-a-1000-hour-eeg-emg-audio-dataset-of-japanese-speech-production.txt",
          "source_ref": "inputs/ssi_fulltext/text/arxiv_2606-01264-a-1000-hour-eeg-emg-audio-dataset-of-japanese-speech-production.txt",
          "section_or_location": "Methods section",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "fact",
          "confidence": 1,
          "statement": "The dataset is publicly available via OpenNeuro in Brain Imaging Data Structure (BIDS) format under a CC0 waiver with approximately 955 GB in size.",
          "evidence_source": "inputs/ssi_fulltext/text/arxiv_2606-01264-a-1000-hour-eeg-emg-audio-dataset-of-japanese-speech-production.txt",
          "source_ref": "inputs/ssi_fulltext/text/arxiv_2606-01264-a-1000-hour-eeg-emg-audio-dataset-of-japanese-speech-production.txt",
          "section_or_location": "Data Records",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 1,
          "statement": "Technical validation comprised power spectral density and event-related potential analyses across participants, devices, and tasks, showing the expected 1/f spectral profile, task-related alpha-band attenuation, and time-locked evoked responses consistent with speech-related cortical activity.",
          "evidence_source": "inputs/ssi_fulltext/text/arxiv_2606-01264-a-1000-hour-eeg-emg-audio-dataset-of-japanese-speech-production.txt",
          "source_ref": "inputs/ssi_fulltext/text/arxiv_2606-01264-a-1000-hour-eeg-emg-audio-dataset-of-japanese-speech-production.txt",
          "section_or_location": "Technical Validation section",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 1,
          "statement": "The dataset involves only three participants, limiting generalizability across larger populations.",
          "evidence_source": "inputs/ssi_fulltext/text/arxiv_2606-01264-a-1000-hour-eeg-emg-audio-dataset-of-japanese-speech-production.txt",
          "source_ref": "inputs/ssi_fulltext/text/arxiv_2606-01264-a-1000-hour-eeg-emg-audio-dataset-of-japanese-speech-production.txt",
          "section_or_location": "Methods section",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "arxiv_2605-08075",
      "slug": "zero-shot-imagined-speech-decoding-via-imagined-to-listened-meg-mapping",
      "title": "Zero-Shot Imagined Speech Decoding via Imagined-to-Listened MEG Mapping",
      "year": 2026,
      "venue": "arXiv",
      "authors": [
        "Maryam Maghsoudi",
        "Shihab Shamma"
      ],
      "url": "https://nao-ki-mura.com/paper/zero-shot-imagined-speech-decoding-via-imagined-to-listened-meg-mapping",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2605.08075",
      "arxiv_url": "https://arxiv.org/abs/2605.08075",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text+summary",
      "source_coverage": "high",
      "tags": [
        "body_site:brain",
        "deployment:speaker-independent",
        "evaluation:quantitative",
        "modality:magnetic",
        "output:text",
        "task:speech-recognition"
      ],
      "expert_take_short": "The study convincingly shows zero-shot imagined speech decoding by mapping MEG imagery to listened responses and decoding with a listened-trained contrastive model, marking a promising data-efficient advance despite limited vocabulary and hardware constraints.",
      "expert_take_long": "This paper presents an innovative approach to imagined speech decoding by learning to map imagined MEG signals to listened MEG responses and leveraging a contrastive decoder trained only on listened data. The approach addresses longstanding challenges of scarce, noisy imagined speech datasets and timing variability by using rhythmic stimuli and trained musicians. The method generalizes well across unseen subjects and mapping architectures, although absolute decoding performance is substantially below listened speech ceilings due to noise and limited vocabulary size. The findings underscore the largely linear nature of the mapping and highlight data quantity as a key bottleneck, with the added insight that complex models like transformers currently underperform due to data scarcity. While hardware and dataset limitations temper deployment prospects, this zero-shot decoding pipeline provides a clear, scalable direction for practical imagined speech BCIs by exploiting richer listened speech neural representations.",
      "expert_true_value": "This is a proof-of-concept demonstration that zero-shot imagined speech decoding is achievable non-invasively by leveraging mappings from imagined to listened MEG and decoders trained only on listened data, highlighting a scalable path forward grounded in leveraging better characterized listened speech signals.",
      "canon_before": "Previous work mainly decoded imagined speech from noisy, limited datasets typically using within-subject training and small vocabularies, struggling with timing variability and low signal-to-noise ratio in imagined MEG or EEG; meanwhile, decoding listened speech had achieved higher accuracy with larger datasets and advanced decoders.",
      "delta_from_canon": "Introduces a novel three-stage pipeline that maps imagined MEG to listened MEG, enabling exploitation of reliable listened speech decoding models for zero-shot decoding of imagined speech without imagined training labels, validated on held-out subjects.",
      "position_in_field": "innovative; advances zero-shot imagined speech decoding leveraging listened speech decoders and cross-condition mappings",
      "practical_value": "Currently limited by hardware and dataset size; presents a methodological advance and proof-of-concept rather than a ready BCI system; practical value expected to grow with dataset and model scaling.",
      "axes_moved": "The paper advances cross-subject imagined speech decoding by learning cross-condition mappings and demonstrates viability of zero-shot decoding using listened speech data and representations.",
      "axes_unresolved": "Generalization to larger vocabularies and real-world continuous imagined speech remains unresolved; the effect of mobility or naturalistic conditions was not addressed.",
      "axes_regressed": "The work is limited in vocabulary size (76 words), decoding performance remains substantially below listened speech decoding ceiling, and fine-grained segment discrimination remains challenging.",
      "technical_limits": "Limited vocabulary of 76 words; signal-to-noise ratio in imagined MEG remains low; transformer models underperform due to dataset size; MEG instrumentation limits portability and deployment.",
      "evaluation_limits": "Evaluations are based on 17 subjects with rhythmic stimuli and limited vocabulary size; results may not generalize to more complex real-world imagined speech; transformer models under current dataset size underperform, suggesting data limits.",
      "deployment_limits": "The system requires MEG hardware which is non-portable and expensive, dataset size and training requirements preclude real-time use today, and vocabulary size and noise limit practical deployment; further dataset scaling and hardware development needed.",
      "scope_limits": "Limited to rhythmic stimuli from trained musicians and small vocabulary decoded from MEG; real-world applicability and continuous natural imagined speech decoding remain open challenges.",
      "task": "speech-recognition",
      "input_modality": "magnetic",
      "sensor_hardware": "157-channel whole-head MEG system (axial gradiometers)",
      "body_site": "brain",
      "output_type": "text",
      "vocabulary_type": "content-words extracted from poems",
      "vocabulary_size": "76",
      "metrics": "Mean Pearson correlation between predicted and actual listened MEG signals; rank-based decoding metrics including Recall@k for word decoding (e.g., Recall@1 up to ~9.1% for combined embeddings); above-chance p-values reported for mapping and decoding.",
      "evaluation_mode": "Quantitative experimental evaluation using cross-subject leave-one-subject-out generalization; analyses incorporate null baseline comparison; evaluation metrics include Pearson correlation and rank-based word decoding.",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 1,
          "statement": "The authors claim to contribute (1) a paired imagined-listened MEG dataset with rhythmic continuous stimuli from trained musicians for better alignment, (2) evaluation of six mapping models to transform imagined MEG into listened MEG with generalization to unseen subjects, (3) evidence that predicted listened signals preserve stimulus-specific information, (4) a contrastive decoder trained on listened data with multiple word embedding strategies, and (5) a proof-of-concept full pipeline for zero-shot imagined speech decoding without imagined labels.",
          "evidence_source": "inputs/ssi_fulltext/text/arxiv_2605-08075-zero-shot-imagined-speech-decoding-via-imagined-to-listened-meg-mapping.txt",
          "source_ref": "inputs/ssi_fulltext/text/arxiv_2605-08075-zero-shot-imagined-speech-decoding-via-imagined-to-listened-meg-mapping.txt",
          "section_or_location": "Abstract, Introduction",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 1,
          "statement": "The approach is novel in using learned mappings to convert imagined brain responses into listened counterparts to utilize richer, more reliable listened data decoders, which prior works did not apply for zero-shot imagined speech decoding.",
          "evidence_source": "inputs/ssi_fulltext/text/arxiv_2605-08075-zero-shot-imagined-speech-decoding-via-imagined-to-listened-meg-mapping.txt",
          "source_ref": "inputs/ssi_fulltext/text/arxiv_2605-08075-zero-shot-imagined-speech-decoding-via-imagined-to-listened-meg-mapping.txt",
          "section_or_location": "Abstract, Introduction",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "fact",
          "confidence": 1,
          "statement": "The study uses a novel MEG dataset recorded from 17 trained musicians exposed to four rhythmic stimuli (two melodies and two poems) with paired imagined and listened conditions, enabling precise temporal alignment; vocabulary size is 76 unique content words from the poems.",
          "evidence_source": "inputs/ssi_fulltext/text/arxiv_2605-08075-zero-shot-imagined-speech-decoding-via-imagined-to-listened-meg-mapping.txt",
          "source_ref": "inputs/ssi_fulltext/text/arxiv_2605-08075-zero-shot-imagined-speech-decoding-via-imagined-to-listened-meg-mapping.txt",
          "section_or_location": "Methods",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 1,
          "statement": "All six mapping architectures achieve prediction correlations significantly above null on training data and generalize significantly above null to unseen subjects, demonstrating the learned mappings are not subject-specific and transfer to new individuals.",
          "evidence_source": "inputs/ssi_fulltext/text/arxiv_2605-08075-zero-shot-imagined-speech-decoding-via-imagined-to-listened-meg-mapping.txt",
          "source_ref": "inputs/ssi_fulltext/text/arxiv_2605-08075-zero-shot-imagined-speech-decoding-via-imagined-to-listened-meg-mapping.txt",
          "section_or_location": "Results",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "Metrics include mean per-channel Pearson correlation between predicted and target listened MEG signals, and rank-based decoding performance such as Recall@1 reaching approximately 9.1% for the combined BERT + Wav2Vec2 embeddings on listened speech decoding.",
          "evidence_source": "inputs/ssi_fulltext/text/arxiv_2605-08075-zero-shot-imagined-speech-decoding-via-imagined-to-listened-meg-mapping.txt",
          "source_ref": "inputs/ssi_fulltext/text/arxiv_2605-08075-zero-shot-imagined-speech-decoding-via-imagined-to-listened-meg-mapping.txt",
          "section_or_location": "Methods and Results",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 1,
          "statement": "Decoding performance is substantially below the listened speech decoding ceiling, limited vocabulary size (76 words), and fine-grained segment discrimination remains challenging; mapping correlation values are small, introducing noise before decoding.",
          "evidence_source": "inputs/ssi_fulltext/text/arxiv_2605-08075-zero-shot-imagined-speech-decoding-via-imagined-to-listened-meg-mapping.txt",
          "source_ref": "inputs/ssi_fulltext/text/arxiv_2605-08075-zero-shot-imagined-speech-decoding-via-imagined-to-listened-meg-mapping.txt",
          "section_or_location": "Results and Discussion",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "deployment_claim",
          "confidence": 1,
          "statement": "The system requires MEG hardware which is non-portable and expensive, dataset size and training requirements preclude real-time use today, and vocabulary size and noise limit practical deployment.",
          "evidence_source": "inputs/ssi_fulltext/text/arxiv_2605-08075-zero-shot-imagined-speech-decoding-via-imagined-to-listened-meg-mapping.txt",
          "source_ref": "inputs/ssi_fulltext/text/arxiv_2605-08075-zero-shot-imagined-speech-decoding-via-imagined-to-listened-meg-mapping.txt",
          "section_or_location": "Discussion",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "author_claim",
          "confidence": 1,
          "statement": "The paper presents a proof-of-concept full pipeline for zero-shot imagined speech decoding from imagined MEG using a learned mapping to listened MEG and a contrastive decoder trained only on listened data, with no imagined speech labels used during training.",
          "evidence_source": "inputs/ssi_fulltext/text/arxiv_2605-08075-zero-shot-imagined-speech-decoding-via-imagined-to-listened-meg-mapping.txt",
          "source_ref": "inputs/ssi_fulltext/text/arxiv_2605-08075-zero-shot-imagined-speech-decoding-via-imagined-to-listened-meg-mapping.txt",
          "section_or_location": "Abstract, Methods, Results, Conclusion",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "rekimoto2026_nasovoce",
      "slug": "nasovoce",
      "title": "NasoVoce: A Nose-Mounted Low-Audibility Speech Interface for Always-Available Speech Interaction",
      "year": 2026,
      "venue": "CHI '26 / arXiv",
      "authors": [
        "Jun Rekimoto",
        "Yu Nishimura",
        "Bojian Yang"
      ],
      "url": "https://nao-ki-mura.com/paper/nasovoce",
      "doi": "10.1145/3772318.3791397",
      "doi_url": "https://doi.org/10.1145/3772318.3791397",
      "arxiv_id": "2603.10324",
      "arxiv_url": "https://arxiv.org/abs/2603.10324",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text+existing_expert_seed",
      "source_coverage": "high",
      "tags": [
        "body_site:face",
        "body_site:nose",
        "body_site:oral-cavity",
        "deployment:hands-free",
        "deployment:mobile-suitable",
        "deployment:wearable",
        "evaluation:quantitative",
        "modality:acoustic",
        "modality:microphone",
        "modality:multimodal",
        "modality:vibration",
        "output:speech-audio"
      ],
      "expert_take_short": "A strong deployment-focused speech interface leveraging a novel nose-pad dual-sensor configuration and multimodal fusion to enable robust low-audibility speech interaction with AI under noise, backed by extensive evaluation.",
      "expert_take_long": "NasoVoce represents a serious, well-constructed system integrating a discreet smart-glasses nose-pad form factor housing a MEMS microphone and vibration sensor. By fusing these complementary modalities via a novel dual-input D-DCCRN enhancement model trained with Whisper-based knowledge distillation, it robustly captures low-volume and whispered speech. The extensive dataset of 104 hours from 45 participants, rigorous evaluation covering ASR metrics, objective speech quality evaluation, a large-scale subjective study, and real-world qualitative trials establish its practical feasibility. While the core method is validated, deployment aspects such as continuous streaming, adaptive sensor fusion depending on context, smartphone integration, and accommodation for physiological variability like nasal patency require further study and development.",
      "expert_true_value": "Demonstrates a feasible, socially acceptable, and noise-robust wearable speech interface design embracing discreet whispered speech input complemented by a novel hardware sensor fusion architecture and rigorous multi-modal enhancement and ASR evaluation, moving beyond small-vocabulary silent speech towards open-vocabulary AI conversation.",
      "canon_before": "Wearable silent speech and whispered speech interfaces typically struggled to balance wearability, vocabulary size, noise robustness, and social acceptability, often limited to small command sets or requiring obtrusive sensors.",
      "delta_from_canon": "NasoVoce innovates by mounting a MEMS microphone and vibration sensor at the smart glasses nose pad, capturing complementary air- and skin-conducted signals for robust low-volume and whispered speech capture, combined with a dual-input enhancement model.",
      "position_in_field": "A strong modern SSI-adjacent wearable speech paper that refocuses from fully silent recognition to discreet, robust AI voice interaction with open vocabulary.",
      "practical_value": "High for wearable AI voice agents by addressing sensor placement, noise robustness, perceptual quality, and practical evaluation in diverse contexts.",
      "axes_moved": "wearability; discreet_speech; noise_robust_capture; ai_voice_agent_alignment; multimodal_fusion",
      "axes_unresolved": "continuous_streaming; adaptive_sensor_gating; longitudinal_wearability; physiological_variability",
      "axes_regressed": "",
      "technical_limits": "Fusion model not fully streaming; whispered vibration signals remain weak limiting enhancement quality; performance under extreme noise favors vibration sensor input only at very low SNR.",
      "evaluation_limits": "Evaluations used synthetic noise corruption for ASR and objective metrics; in-the-wild testing was qualitative with limited environments; no unseen words generalization tested.",
      "deployment_limits": "Fully streaming continuous operation, smartphone integration, adaptive sensor gating based on SNR, and calibration for physiological factors such as nasal patency remain future work.",
      "scope_limits": "Targets low-audibility whispered speech, not fully silent speech without any acoustic leakage; assumes hand-covering mouth for privacy.",
      "task": "speech-enhancement; speech-recognition for whispered and low-audibility speech to support always-available AI voice interaction",
      "input_modality": "acoustic; vibration; multimodal",
      "sensor_hardware": "MEMS microphone (Syntiant SPH0141LM4H-1) and MEMS vibration sensor (Syntiant V2S200D) integrated in smart glasses nose pads providing synchronized PDM output.",
      "body_site": "face; nose; oral-cavity",
      "output_type": "speech-audio",
      "vocabulary_type": "Open-vocabulary speech",
      "vocabulary_size": "Open vocabulary via downstream Whisper ASR",
      "metrics": "Recognition accuracy expressed in word error rate (WER) and character error rate (CER) under varying noise conditions; PESQ and STOI perceptual audio quality; MUSHRA subjective audio quality scores.",
      "evaluation_mode": "Quantitative ASR accuracy (WER, CER) on held-out data, objective perceptual quality metrics (PESQ, STOI), MUSHRA subjective ratings with 50 evaluators, and qualitative in-the-wild recordings in four real-world environments.",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 1,
          "statement": "NasoVoce is a smart-glasses nose-pad interface integrating a MEMS microphone and vibration sensor to capture low-audibility whispered speech robustly under environmental noise.",
          "evidence_source": "inputs/ssi_fulltext/text/rekimoto2026_nasovoce-nasovoce-a-nose-mounted-low-audibility-speech-interface-for-always-available-spe.txt",
          "source_ref": "inputs/ssi_fulltext/text/rekimoto2026_nasovoce-nasovoce-a-nose-mounted-low-audibility-speech-interface-for-always-available-spe.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 1,
          "statement": "The main sensing innovation is the nose-pad location combining airborne and skin-conducted sensing with a MEMS microphone and MEMS vibration sensor, mounted on smart glasses for discreet always-available speech interaction.",
          "evidence_source": "inputs/ssi_fulltext/text/rekimoto2026_nasovoce-nasovoce-a-nose-mounted-low-audibility-speech-interface-for-always-available-spe.txt",
          "source_ref": "inputs/ssi_fulltext/text/rekimoto2026_nasovoce-nasovoce-a-nose-mounted-low-audibility-speech-interface-for-always-available-spe.txt",
          "section_or_location": "3 NasoVoce",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "ASR recognition accuracy measured in word error rate (WER) and character error rate (CER) using Whisper Large-v2 with 1,000 held-out items under synthetic noise ranging from -10 dB to 10 dB; objective perceptual speech quality evaluated with PESQ and STOI metrics; subjective MUSHRA ratings with 50 participants under different noise conditions.",
          "evidence_source": "inputs/ssi_fulltext/text/rekimoto2026_nasovoce-nasovoce-a-nose-mounted-low-audibility-speech-interface-for-always-available-spe.txt",
          "source_ref": "inputs/ssi_fulltext/text/rekimoto2026_nasovoce-nasovoce-a-nose-mounted-low-audibility-speech-interface-for-always-available-spe.txt",
          "section_or_location": "5 Evaluation",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 1,
          "statement": "Limitations include not yet achieving fully streaming speech processing integration, fusion performance degradation under extreme noise favoring vibration-only input, and the need for calibration to physiological variability such as nasal patency.",
          "evidence_source": "inputs/ssi_fulltext/text/rekimoto2026_nasovoce-nasovoce-a-nose-mounted-low-audibility-speech-interface-for-always-available-spe.txt",
          "source_ref": "inputs/ssi_fulltext/text/rekimoto2026_nasovoce-nasovoce-a-nose-mounted-low-audibility-speech-interface-for-always-available-spe.txt",
          "section_or_location": "6 Discussions",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "arxiv_2511-21740",
      "slug": "a-cross-species-neural-foundation-model-for-end-to-end-speech-decoding",
      "title": "A cross-species neural foundation model for end-to-end speech decoding",
      "year": 2025,
      "venue": "arXiv",
      "authors": [
        "Yizi Zhang",
        "Linyang He",
        "Chaofei Fan",
        "Tingkai Liu",
        "Han Yu",
        "Trung Le",
        "Jingyuan Li",
        "Scott Linderman",
        "Lea Duncker",
        "Francis R Willett",
        "Nima Mesgarani",
        "Liam Paninski"
      ],
      "url": "https://nao-ki-mura.com/paper/a-cross-species-neural-foundation-model-for-end-to-end-speech-decoding",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2511.21740",
      "arxiv_url": "https://arxiv.org/abs/2511.21740",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text+summary",
      "source_coverage": "high",
      "tags": [
        "evaluation:quantitative",
        "output:text",
        "task:speech-recognition"
      ],
      "expert_take_short": "Introduces a cross-species pretrained transformer encoder enabling state-of-the-art end-to-end neural speech decoding with audio-LLMs, improving accuracy and enabling imagined speech decoding, but latency and real-time deployment remain challenges.",
      "expert_take_long": "This paper presents a significant step forward in speech brain-computer interfaces by integrating a transformer-based neural encoder pretrained via self-supervised learning on extensive human and monkey neural datasets with an audio-LLM decoder for end-to-end neural-to-text speech decoding. The approach achieves state-of-the-art word error rates on challenging Brain-to-Text ’24 and ’25 benchmarks, substantially improving upon prior RNN-based cascaded and end-to-end models. Importantly, the model successfully decodes both attempted and imagined speech, exhibiting cross-task representational alignment. However, practical deployment is currently limited by slower inference speeds for the end-to-end approach and bidirectional attention's unsuitability for real-time decoding. Further improvements in LLM decoder design, modality alignment, and handling neural signal variability and plasticity are required for long-term, real-world application. Overall, the approach offers a promising foundation for future scalable, integrated neuroprosthetic speech decoding systems.",
      "expert_true_value": "Establishes transformer-based cross-species self-supervised pretraining combined with audio-LLM end-to-end decoding as a new paradigm for speech BCIs achieving top decoding accuracy and cross-task generalization, though practical real-time use and robustness require further work.",
      "canon_before": "Prior speech BCIs relied on cascaded RNN encoders decoding phonemes combined with n-gram language models. Large-scale pretraining was limited or absent and transformers were not widely used; end-to-end speech decoding with LLMs was emerging but still relied on RNNs and lacked large-scale neural data integration.",
      "delta_from_canon": "Introduces a pretrained transformer-based neural encoder trained cross-species and cross-task with self-supervised masked modeling, integrated end-to-end with large audio-LLMs via contrastive alignment to decode neural activity directly to text, improving accuracy and enabling imagined speech decoding and cross-task generalization.",
      "position_in_field": "Advances state-of-the-art in intracortical speech BCIs by enabling end-to-end transformer and LLM-based decoding with pretrained neural encoders and cross-modal alignment, moving beyond cascaded and RNN-based methods.",
      "practical_value": "High potential for improving speech neuroprosthetics accuracy and enabling imagined speech decoding, but current latency and generalization limitations constrain immediate real-world deployment.",
      "axes_moved": "Input modality moved from RNN-based phoneme decoding to transformer-based encoder with cross-species self-supervised pretraining; output shifted from phoneme-level to direct neural-to-text end-to-end decoding; problem framing moved from cascaded systems to joint end-to-end differentiable optimization integrating neural and language model modalities.",
      "axes_unresolved": "Generalization to unseen subjects remains unclear; full real-time suitability and robust long-term deployment with larger LLMs remain open; addressing neural plasticity and co-adaptation is future work.",
      "axes_regressed": "Decoding latency for end-to-end method is slower (approx. 0.95s per sentence) than cascaded (0.24s); bidirectional attention in encoder limits online real-time decoding currently.",
      "technical_limits": "End-to-end latency and computational requirements; bidirectional attention inhibits online decoding; large pretrained LLMs require substantial resources; need for large labeled and unlabeled datasets; limited real-time applicability currently.",
      "evaluation_limits": "Evaluations focused on two human participants (T12, T15) and limited imagined speech vocabulary; unknown performance on larger, more diverse subject sets; small batch sizes for contrastive learning may limit modality alignment effectiveness; inference strategies like nucleus sampling could be further optimized.",
      "deployment_limits": "End-to-end model latency (~0.95s per sentence) and requirement for bidirectional attention constrain real-time deployment; larger LLMs unsuitable for on-device use; dependency on large unlabeled and labeled datasets for pretraining and fine-tuning.",
      "scope_limits": "Limited to Utah array intracortical neural recordings, tested only on two human participants and associated monkey data; generalization to other populations or hardware untested.",
      "task": "speech-recognition",
      "input_modality": "electrophysiological neural spiking activity (Utah arrays)",
      "sensor_hardware": "Utah microelectrode arrays with thresholded spikes and spike-band power (SBP) features",
      "body_site": "brain",
      "output_type": "text",
      "vocabulary_type": "natural language English text",
      "vocabulary_size": "Up to 125,000 words in attempted speech dataset; 50 words for imagined speech dataset",
      "metrics": "Word Error Rate (WER) on Brain-to-Text ’24 and ’25 benchmarks; Phoneme Error Rate (PER) in phoneme decoding; Representational Similarity Analysis scores; processing latency per sentence (approx. 0.95s end-to-end, 0.24s cascaded)",
      "evaluation_mode": "Quantitative performance evaluation on held-out datasets (word error rate), ablation studies, representational similarity analysis, and decoding error analyses.",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 1,
          "statement": "We introduce an end-to-end BraIn-to-Text (BIT) framework that translates neural activity into coherent sentences using a single differentiable neural network with a cross-task, cross-species pretrained neural encoder, supporting both attempted and imagined speech decoding.",
          "evidence_source": "inputs/ssi_fulltext/text/arxiv_2511-21740-a-cross-species-neural-foundation-model-for-end-to-end-speech-decoding.txt",
          "source_ref": "inputs/ssi_fulltext/text/arxiv_2511-21740-a-cross-species-neural-foundation-model-for-end-to-end-speech-decoding.txt",
          "section_or_location": "Abstract, Introduction, Methods, Experiments, Discussion",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "BIT Cascaded achieves state-of-the-art WER of 6.35% on Brain-to-Text ’24 hold-out, outperforming previous best 7.98%, and BIT End-to-End reduces prior end-to-end WER from 24.69% to 10.22%.",
          "evidence_source": "inputs/ssi_fulltext/text/arxiv_2511-21740-a-cross-species-neural-foundation-model-for-end-to-end-speech-decoding.txt",
          "source_ref": "inputs/ssi_fulltext/text/arxiv_2511-21740-a-cross-species-neural-foundation-model-for-end-to-end-speech-decoding.txt",
          "section_or_location": "Evaluation",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "fact",
          "confidence": 1,
          "statement": "The neural encoder is a transformer pretrained with self-supervised masked modeling on 367 hours of human and monkey Utah array neural data across speech and motor tasks.",
          "evidence_source": "inputs/ssi_fulltext/text/arxiv_2511-21740-a-cross-species-neural-foundation-model-for-end-to-end-speech-decoding.txt",
          "source_ref": "inputs/ssi_fulltext/text/arxiv_2511-21740-a-cross-species-neural-foundation-model-for-end-to-end-speech-decoding.txt",
          "section_or_location": "Methods",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "fact",
          "confidence": 1,
          "statement": "Speech decoding was conducted on Brain-to-Text Benchmark ’24 and ’25 intracortical Utah array datasets with two human participants (T12 and T15) for attempted speech and a smaller imagined speech dataset involving the same individuals.",
          "evidence_source": "inputs/ssi_fulltext/text/arxiv_2511-21740-a-cross-species-neural-foundation-model-for-end-to-end-speech-decoding.txt",
          "source_ref": "inputs/ssi_fulltext/text/arxiv_2511-21740-a-cross-species-neural-foundation-model-for-end-to-end-speech-decoding.txt",
          "section_or_location": "Methods, Evaluation",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "fact",
          "confidence": 1,
          "statement": "Phoneme decoding uses a 41-token vocabulary including phonemes plus blank and silence tokens, with phoneme error rates (PER) correlated with word error rates (WER) after decoding.",
          "evidence_source": "inputs/ssi_fulltext/text/arxiv_2511-21740-a-cross-species-neural-foundation-model-for-end-to-end-speech-decoding.txt",
          "source_ref": "inputs/ssi_fulltext/text/arxiv_2511-21740-a-cross-species-neural-foundation-model-for-end-to-end-speech-decoding.txt",
          "section_or_location": "Appendix",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 1,
          "statement": "End-to-end decoding requires about 0.95 seconds per sentence on average, slower than cascaded decoding at 0.24 seconds, limiting real-time applicability; bidirectional attention in the neural encoder is unsuitable for online decoding.",
          "evidence_source": "inputs/ssi_fulltext/text/arxiv_2511-21740-a-cross-species-neural-foundation-model-for-end-to-end-speech-decoding.txt",
          "source_ref": "inputs/ssi_fulltext/text/arxiv_2511-21740-a-cross-species-neural-foundation-model-for-end-to-end-speech-decoding.txt",
          "section_or_location": "Discussion",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "deployment_claim",
          "confidence": 0.9,
          "statement": "LLMs of larger scale than 1.5B parameters used here cannot run on-device, limiting mobile real-time applications.",
          "evidence_source": "inputs/ssi_fulltext/text/arxiv_2511-21740-a-cross-species-neural-foundation-model-for-end-to-end-speech-decoding.txt",
          "source_ref": "inputs/ssi_fulltext/text/arxiv_2511-21740-a-cross-species-neural-foundation-model-for-end-to-end-speech-decoding.txt",
          "section_or_location": "Discussion",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 1,
          "statement": "This work combines cross-species, cross-task transformer-based self-supervised pretraining with an audio-LLM end-to-end decoder for neural speech decoding, a novel integration beyond prior cascaded, RNN-based, or purely task-specific models.",
          "evidence_source": "inputs/ssi_fulltext/text/arxiv_2511-21740-a-cross-species-neural-foundation-model-for-end-to-end-speech-decoding.txt",
          "source_ref": "inputs/ssi_fulltext/text/arxiv_2511-21740-a-cross-species-neural-foundation-model-for-end-to-end-speech-decoding.txt",
          "section_or_location": "Full text",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 1,
          "statement": "Evaluations were conducted on two particular human participants (T12 and T15) for attempted speech with large vocabularies and on imagined speech with a reduced 50-word vocabulary, limiting generalization scope.",
          "evidence_source": "inputs/ssi_fulltext/text/arxiv_2511-21740-a-cross-species-neural-foundation-model-for-end-to-end-speech-decoding.txt",
          "source_ref": "inputs/ssi_fulltext/text/arxiv_2511-21740-a-cross-species-neural-foundation-model-for-end-to-end-speech-decoding.txt",
          "section_or_location": "Evaluation",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "fact",
          "confidence": 1,
          "statement": "Contrastive learning is employed to align neural and text embeddings in a shared latent space to improve cross-modal alignment and decoding performance.",
          "evidence_source": "inputs/ssi_fulltext/text/arxiv_2511-21740-a-cross-species-neural-foundation-model-for-end-to-end-speech-decoding.txt",
          "source_ref": "inputs/ssi_fulltext/text/arxiv_2511-21740-a-cross-species-neural-foundation-model-for-end-to-end-speech-decoding.txt",
          "section_or_location": "Methods",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "fact",
          "confidence": 1,
          "statement": "This model was pretrained on thresholded spikes and spike-band power (SBP) features from Utah array intracortical recordings.",
          "evidence_source": "inputs/ssi_fulltext/text/arxiv_2511-21740-a-cross-species-neural-foundation-model-for-end-to-end-speech-decoding.txt",
          "source_ref": "inputs/ssi_fulltext/text/arxiv_2511-21740-a-cross-species-neural-foundation-model-for-end-to-end-speech-decoding.txt",
          "section_or_location": "Methods",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_sonicvisionlm-playing-sound-with-vision-language-models",
      "slug": "sonicvisionlm-playing-sound-with-vision-language-models",
      "title": "SonicVisionLM: Playing Sound with Vision Language Models",
      "year": 2024,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Zhifeng Xie",
        "Shengye Yu",
        "Mengtian Li",
        "Qile He",
        "Chaofeng Chen",
        "Yugang Jiang"
      ],
      "url": "https://nao-ki-mura.com/paper/sonicvisionlm-playing-sound-with-vision-language-models",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2401.04394",
      "arxiv_url": "https://arxiv.org/abs/2401.04394",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "high",
      "review_basis": "full_text+structured_benchmark+summary",
      "source_coverage": "high",
      "tags": [
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "modality:multimodal",
        "modality:video",
        "output:audio",
        "task:dataset"
      ],
      "expert_take_short": "A high-quality video-to-audio generation framework leveraging vision-language models for editable, temporally precise sound effect generation; strong experimental validations but outside standard SSI scope.",
      "expert_take_long": "SonicVisionLM advances video-to-audio generation by creatively decomposing the task: a vision-language model proposes plausible sound event labels from silent video frames, a timestamp detection module accurately localizes events in time, and a novel time-controllable latent diffusion adapter conditions generation on synchronized text and timing inputs. The extensive CondPromptBank dataset supports training, focused on single sound effects with precise timestamps and rich textual descriptions. Quantitatively, the model outperforms prior state-of-the-art CondFoleyGen and other baselines in all key metrics, including semantic match, timing accuracy, and subjective human ratings for alignment and quality. The system’s multi-soundtrack generation supports user customization to add off-screen sounds, mirroring professional post-production workflows. Despite these advances, the authors acknowledge the need to enhance visual understanding and timestamp prediction further and the lack of real-time or mobile deployment readiness. This work represents a substantial contribution to controllable audiovisual synthesis but is outside SSI core, focusing on video-derived sound effects rather than silent speech sensing.",
      "expert_true_value": "Introduces a modular audiovisual sound synthesis pipeline that replaces monolithic video-to-audio regression with interpretable video-to-text event detection plus timestamped text-to-audio generation, enabling strong synchronization and user-editable sound design.",
      "canon_before": "Prior video-sound generation methods directly aligned video features to audio as a monolithic regression task, often yielding poor semantic matching, timing synchronization, and lacking user-editable control.",
      "delta_from_canon": "Transforms video-sound generation into a decomposed pipeline of video-to-text event detection (via VLM), timestamp sound localization, and time-conditioned text-to-audio generation with user-editable controls.",
      "position_in_field": "A strong audiovisual generation and dataset contribution that is adjacent to but not within core silent speech interface research.",
      "practical_value": "High practical utility for video post-production workflows and controllable sound effect generation; low direct relevance to silent speech interface applications.",
      "axes_moved": "vlm_decomposition; timestamp_control; editable_generation; dataset_scale",
      "axes_unresolved": "visual_understanding; timestamp_detection; broader_audio_control",
      "axes_regressed": "",
      "technical_limits": "Visual understanding and timestamp detection require further refinement; broader control of audio generation and editability beyond current focus remains an open challenge; model computationally intensive and not real-time.",
      "evaluation_limits": "Benchmarks use zero-shot datasets with mixed or noisy audio ground truths, complicating metric interpretation; subjective metrics complement objective evaluations but lack broad unseen word/generalization studies.",
      "deployment_limits": "Designed for post-production workflows; lacks real-time interactive authoring system and requires GPU-level compute; relies on offline processing and does not address mobile or embedded deployment.",
      "scope_limits": "An audiovisual generation paper outside the silent speech interface domain; focuses on sound effects for video post-production rather than silent speech communication devices or sensing.",
      "task": "audio-classification; dataset; multimodal_generation",
      "input_modality": "video",
      "sensor_hardware": "video camera frames only; no specialized silent speech sensors",
      "body_site": "",
      "output_type": "audio",
      "vocabulary_type": "Open text prompts grounded by sound-effect categories and timestamped event descriptions",
      "vocabulary_size": "10276 sound effect entries across 23 categories in CondPromptBank dataset",
      "metrics": "Exact reported values include conditional generation CLAP-top scores (36.8% and 42.8%), Onset Accuracy (27.6%), Onset AP (78.1%), Time Accuracy (43.8%), Intersection over Union (39.7%). Unconditional generation IoU scores of 39.5 and 42.0 on Greatest Hits and CountixAV datasets respectively. Subjective scores of Overall Audio Quality (75), Relevance (69), and Time-Sync (87). Also FID scores under 25 and MKL scores around 2.3 for time-conditioned adapter ablation.",
      "evaluation_mode": "Quantitative objective metrics (CLAP-top, Onset accuracy, IoU, FID, MKL), zero-shot on Greatest Hits and CountixAV datasets; 300-person large-scale subjective study evaluating audio quality, relevance, and synchronization; ablation on time-conditioned adapter.",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 1,
          "statement": "We propose a novel framework called SonicVisionLM and collect a dataset CondPromptBank specifically for training a time-controllable adapter. It ensures the generated sound aligns perfectly with our text input and maintains precise timing control.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_sonicvisionlm-playing-sound-with-vision-language-models-sonicvisionlm-playing-sound-with-vision-language-models.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_sonicvisionlm-playing-sound-with-vision-language-models-sonicvisionlm-playing-sound-with-vision-language-models.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 1,
          "statement": "SonicVisionLM decomposes video-sound generation into video-to-text event detection using a vision-language model, timestamp prediction with a ResNet(2+1)-D18 network, and a time-controllable latent diffusion adapter for text-to-audio generation allowing user edits.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_sonicvisionlm-playing-sound-with-vision-language-models-sonicvisionlm-playing-sound-with-vision-language-models.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_sonicvisionlm-playing-sound-with-vision-language-models-sonicvisionlm-playing-sound-with-vision-language-models.txt",
          "section_or_location": "3. Method",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "Conditional video-to-audio generation achieves CLAP-top scores of 36.8% and 42.8%, Onset Accuracy 27.6%, Onset AP 78.1%, Time Accuracy 43.8%, and IoU 39.7%. Unconditional generation IoU scores are 39.5 on Greatest Hits and 42.0 on CountixAV datasets. Subjective scores from 300 evaluators were OVL 75, REL 69, and Time-sync 87.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_sonicvisionlm-playing-sound-with-vision-language-models-sonicvisionlm-playing-sound-with-vision-language-models.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_sonicvisionlm-playing-sound-with-vision-language-models-sonicvisionlm-playing-sound-with-vision-language-models.txt",
          "section_or_location": "4. Experiments",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 1,
          "statement": "The authors explicitly acknowledge visual understanding and timestamp detection modules still require refinement; wider audio generation control beyond current time synchronization is an open challenge; model is not real-time and targets post-production use rather than live deployment.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_sonicvisionlm-playing-sound-with-vision-language-models-sonicvisionlm-playing-sound-with-vision-language-models.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_sonicvisionlm-playing-sound-with-vision-language-models-sonicvisionlm-playing-sound-with-vision-language-models.txt",
          "section_or_location": "5. Conclusion",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_ir-uwb-radar-based-contactless-silent-speech-recognition-of-vowels-consonants-words-and-phrases",
      "slug": "ir-uwb-radar-based-contactless-silent-speech-recognition-of-vowels-consonants-words-and-phrases",
      "title": "IR-UWB Radar-Based Contactless Silent Speech Recognition of Vowels, Consonants, Words, and Phrases",
      "year": 2023,
      "venue": "arXiv / imported corpus page",
      "authors": [],
      "url": "https://nao-ki-mura.com/paper/ir-uwb-radar-based-contactless-silent-speech-recognition-of-vowels-consonants-words-and-phrases",
      "doi": "10.1109/ACCESS.2023.3344177",
      "doi_url": "https://doi.org/10.1109/ACCESS.2023.3344177",
      "arxiv_id": "2312.09572",
      "arxiv_url": "https://arxiv.org/abs/2312.09572",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:lip",
        "modality:radar",
        "output:text",
        "task:speech-recognition",
        "evaluation:quantitative",
        "deployment:hands-free",
        "deployment:mobile-suitable"
      ],
      "expert_take_short": "This paper introduces FERASEC, a novel radar feature extraction enabling the first contactless IR-UWB radar phoneme-level silent speech recognition with 86% vowel and 81% consonant accuracy, surpassing raw signal baselines and signifying a key advance in practical silent speech interfaces.",
      "expert_take_long": "This work presents the first successful demonstration of contactless phoneme-level silent speech recognition using IR-UWB radar by introducing FERASEC, a radar-specific feature extraction method that transforms raw 2D radar frames into abbreviated envelopes, effectively capturing articulator movements despite noise and clutter. Comprehensive evaluations on 8 vowels, 11 consonants, 25 words, and 12 phrases show classification accuracies up to 86.47% for vowels and 81.59% for consonants using DNN–HMM classifiers with radar antenna positioned near the lips, outperforming baseline methods and prior radar SSR studies. The use of two radar placements reveals that upper lip-positioned radar better captures critical lip motions, necessary for phoneme discrimination. Although raw radar data or end-to-end deep learning on raw input perform poorly (<50%), engineered FERASEC features enable robust recognition. Controlled prompted speech data from 20 participants supports strong validation of phoneme recognition capability, although conversational and open-vocabulary scenarios remain untested. The study insightfully discusses practical deployment issues such as sensor alignment aiding and potential integration into consumer devices, marking a significant advance toward real-world contactless silent speech interfaces.",
      "expert_true_value": "A radar-specialized feature extraction and system design that enables realistic contactless phoneme-level silent speech recognition, a milestone beyond previous radar SSR word-only demos, verified with a large experimental dataset and classifier comparisons.",
      "canon_before": "Prior radar-based SSR showed limited small word-level demos with weak or unproven phoneme-level contactless recognition.",
      "delta_from_canon": "Introduces FERASEC and demonstrates meaningful contactless IR-UWB radar SSR accuracy on phonemes, words, and phrases with radar antenna placement and aiding logic considerations.",
      "position_in_field": "Core contactless SSI work establishing radar phoneme recognition feasibility with pragmatic sensor placement and aiding algorithms.",
      "practical_value": "High potential for hands-free SSR in mobile or wearable devices if hardware integration and user alignment challenges are solved.",
      "axes_moved": "contactless_ssr; phoneme_level_radar; mobile_embedding_potential",
      "axes_unresolved": "open_vocabulary; conversational_use; broader_speaker_variability",
      "axes_regressed": "",
      "technical_limits": "Performance affected by articulator alignment; signal blockage by teeth can obscure tongue motion with upper radar; FERASEC depends on effective clutter mitigation and engineered transforms.",
      "evaluation_limits": "Controlled lab study with prompted closed-set items, limited speaker diversity (mostly Korean with American English speakers), no open vocabulary or conversational speech tested.",
      "deployment_limits": "Needs robust user-position handling, mitigating signal blockage (e.g. teeth), and hardware integration to small devices for daily use.",
      "scope_limits": "Closed-set prompted isolated and short phrase recognition; no open vocabulary, spontaneous speech, or broader speaker variability tested.",
      "task": "speech-recognition",
      "input_modality": "radar",
      "sensor_hardware": "Upper and lower IR-UWB radar sensors; upper uses patch antennas facing lips, lower uses sinuous antennas below chin.",
      "body_site": "lip",
      "output_type": "text",
      "vocabulary_type": "8 vowels, 11 consonants, 25 words, and 12 phrases",
      "vocabulary_size": "56 speech items across four task sets",
      "metrics": "Classification accuracy evaluated via leave-one-out cross-validation across 20 reps per phoneme/word/phrase; vowels achieved 86.47%, consonants 81.59%, words 88.95%, phrases 96.88% with FERASEC + DNN-HMM upper radar.",
      "evaluation_mode": "Leave-one-out cross-validation across repeated prompted articulation; separate experiments per speech unit type and radar position.",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 0.95,
          "statement": "This study represents the first demonstration of phoneme-level SSR using contactless radar and proposes a novel speech feature extraction algorithm specifically designed for IR-UWB radar-based SSR.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_ir-uwb-radar-based-contactless-silent-speech-recognition-of-vowels-consonants-words-and-phrases-ir-uwb-radar-based-contactless-silent-speech-recognition-of-vowels-consonants-wo.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_ir-uwb-radar-based-contactless-silent-speech-recognition-of-vowels-consonants-words-and-phrases-ir-uwb-radar-based-contactless-silent-speech-recognition-of-vowels-consonants-wo.txt",
          "section_or_location": "ABSTRACT",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "fact",
          "confidence": 0.99,
          "statement": "The study uses a data set of 20 participants pronouncing 8 vowels, 11 consonants, 25 words, and 12 phrases, with 20 repetitions each, recorded using two IR-UWB radar antennas positioned in front of lips and below chin.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_ir-uwb-radar-based-contactless-silent-speech-recognition-of-vowels-consonants-words-and-phrases-ir-uwb-radar-based-contactless-silent-speech-recognition-of-vowels-consonants-wo.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_ir-uwb-radar-based-contactless-silent-speech-recognition-of-vowels-consonants-words-and-phrases-ir-uwb-radar-based-contactless-silent-speech-recognition-of-vowels-consonants-wo.txt",
          "section_or_location": "A SPEECH STIMULI",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 0.98,
          "statement": "Introduces FERASEC, a novel radar-specific speech feature extraction algorithm reducing dimensionality of IR-UWB radar frames into envelope features capturing articulator movement, essential for phoneme-level SSR accuracy.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_ir-uwb-radar-based-contactless-silent-speech-recognition-of-vowels-consonants-words-and-phrases-ir-uwb-radar-based-contactless-silent-speech-recognition-of-vowels-consonants-wo.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_ir-uwb-radar-based-contactless-silent-speech-recognition-of-vowels-consonants-words-and-phrases-ir-uwb-radar-based-contactless-silent-speech-recognition-of-vowels-consonants-wo.txt",
          "section_or_location": "A PROPOSED FEATURE EXTRACTION ALGORITHM",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 0.98,
          "statement": "FERASEC utilizes concatenation of raw and clutter-reduced radar frames, envelope detection, downsampling, DC removal, plus first and second derivatives, creating a six-dimensional feature set for classification.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_ir-uwb-radar-based-contactless-silent-speech-recognition-of-vowels-consonants-words-and-phrases-ir-uwb-radar-based-contactless-silent-speech-recognition-of-vowels-consonants-wo.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_ir-uwb-radar-based-contactless-silent-speech-recognition-of-vowels-consonants-words-and-phrases-ir-uwb-radar-based-contactless-silent-speech-recognition-of-vowels-consonants-wo.txt",
          "section_or_location": "A PROPOSED FEATURE EXTRACTION ALGORITHM",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 0.98,
          "statement": "Evaluation conducted on 20 native or semi-native English-speaking participants trained on closed-set prompted 8 vowels, 11 consonants, 25 phonetically balanced words, and 12 phrases repeated 20 times with leave-one-out cross validation.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_ir-uwb-radar-based-contactless-silent-speech-recognition-of-vowels-consonants-words-and-phrases-ir-uwb-radar-based-contactless-silent-speech-recognition-of-vowels-consonants-wo.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_ir-uwb-radar-based-contactless-silent-speech-recognition-of-vowels-consonants-words-and-phrases-ir-uwb-radar-based-contactless-silent-speech-recognition-of-vowels-consonants-wo.txt",
          "section_or_location": "A SPEECH STIMULI",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.99,
          "statement": "FERASEC combined with DNN-HMM and upper radar antenna placement yields recognition accuracies of 86.47% vowels, 81.59% consonants, 88.95% words, and 96.88% phrases.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_ir-uwb-radar-based-contactless-silent-speech-recognition-of-vowels-consonants-words-and-phrases-ir-uwb-radar-based-contactless-silent-speech-recognition-of-vowels-consonants-wo.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_ir-uwb-radar-based-contactless-silent-speech-recognition-of-vowels-consonants-words-and-phrases-ir-uwb-radar-based-contactless-silent-speech-recognition-of-vowels-consonants-wo.txt",
          "section_or_location": "HMM",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.98,
          "statement": "Without engineered feature extraction (FERASEC), using raw or clutter-reduced frame sets alone for end-to-end DNN-HMM phoneme recognition yields accuracies below 50%.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_ir-uwb-radar-based-contactless-silent-speech-recognition-of-vowels-consonants-words-and-phrases-ir-uwb-radar-based-contactless-silent-speech-recognition-of-vowels-consonants-wo.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_ir-uwb-radar-based-contactless-silent-speech-recognition-of-vowels-consonants-words-and-phrases-ir-uwb-radar-based-contactless-silent-speech-recognition-of-vowels-consonants-wo.txt",
          "section_or_location": "1 Necessity of Developing a Feature Extraction Algorithm for IR",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "deployment_claim",
          "confidence": 0.95,
          "statement": "Although the current proof-of-concept uses a hardware testbed, IR-UWB radar modules based on CMOS transceiver chips and antenna integration feasibility imply potential embedding of contactless SSR into smartphones or wearables.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_ir-uwb-radar-based-contactless-silent-speech-recognition-of-vowels-consonants-words-and-phrases-ir-uwb-radar-based-contactless-silent-speech-recognition-of-vowels-consonants-wo.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_ir-uwb-radar-based-contactless-silent-speech-recognition-of-vowels-consonants-words-and-phrases-ir-uwb-radar-based-contactless-silent-speech-recognition-of-vowels-consonants-wo.txt",
          "section_or_location": "BASED SSR STUDIES",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "deployment_claim",
          "confidence": 0.95,
          "statement": "A supporting aiding algorithm checks that the user's lips are properly aligned and within radar detection range before silent speech capture, improving usability for consumer device integration.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_ir-uwb-radar-based-contactless-silent-speech-recognition-of-vowels-consonants-words-and-phrases-ir-uwb-radar-based-contactless-silent-speech-recognition-of-vowels-consonants-wo.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_ir-uwb-radar-based-contactless-silent-speech-recognition-of-vowels-consonants-words-and-phrases-ir-uwb-radar-based-contactless-silent-speech-recognition-of-vowels-consonants-wo.txt",
          "section_or_location": "2 IR",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_ultrasensitive-textile-strain-sensors-redefine-wearable-silent-speech-interfaces-with-high-machine-learning-efficiency",
      "slug": "ultrasensitive-textile-strain-sensors-redefine-wearable-silent-speech-interfaces-with-high-machine-learning-efficiency",
      "title": "Ultrasensitive Textile Strain Sensors Redefine Wearable Silent Speech Interfaces with High Machine Learning Efficiency",
      "year": 2023,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Chenyu Tang",
        "Muzi Xu",
        "Wentian Yi",
        "Zibo Zhang",
        "Edoardo Occhipinti",
        "Chaoqun Dong",
        "Dafydd Ravenscroft",
        "Sung-Min Jung",
        "Shuo Gao",
        "Jong Min Kim",
        "Luigi G. Occhipinti"
      ],
      "url": "https://nao-ki-mura.com/paper/ultrasensitive-textile-strain-sensors-redefine-wearable-silent-speech-interfaces-with-high-machine-learning-efficiency",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2311.15683",
      "arxiv_url": "https://arxiv.org/abs/2311.15683",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:throat",
        "modality:vibration",
        "task:command-recognition",
        "output:labels",
        "deployment:hands-free",
        "deployment:real-time",
        "deployment:speaker-dependent",
        "deployment:wearable",
        "evaluation:quantitative"
      ],
      "expert_take_short": "Strong SSI system combining a novel ultrasensitive throat textile strain sensor with an efficient 1D residual CNN, achieving high word classification accuracy with low computational cost and promising few-shot transfer to new users and words on small vocabularies.",
      "expert_take_long": "This paper presents a significant hardware-software co-design for wearable silent speech interfaces by innovating an ultrasensitive graphene-coated textile strain sensor with ordered microcracks integrated into a throat choker, dramatically enhancing sensor sensitivity (gauge factor 317 within 5% strain) and durability (>10,000 cycles). This enhanced signal information density enables a lightweight 1D residual CNN to maintain high decoding accuracy (up to 95.25% on 20 words) while reducing computational load by 90%. The model's robustness is further enhanced by a novel random noise window augmentation method, allowing noise resilience without conventional filtering. The authors validate the system on multiple datasets containing frequent, confusable, and slow/fast-spoken words from 3 participants, achieving strong accuracies and demonstrating promising few-shot transfer learning to new users and unseen words with only 30 samples per class (90% accuracy). These results show a clear advance in balancing wearable comfort, decoding accuracy, and energy efficiency. However, the study's limitations include small user numbers, closed vocabularies, and word-level classification only, leaving questions about continuous speech decoding and generalizability in diverse real-world scenarios. Overall, the paper substantially advances wearable SSIs through sensor-model synergy and outlines a clear trajectory for scaling to practical deployment with larger vocabularies and user bases.",
      "expert_true_value": "The work's core contribution is the hardware-software synergy where improving sensor sensitivity and signal quality via ordered graphene microcracks enables a compact, low-complexity neural network to attain high decoding accuracy and robustness, reducing reliance on multi-channel or heavy models and making practical wearable SSI more feasible.",
      "canon_before": "Most wearable SSI rely on either multi-channel sensor arrays or complex multidimensional modeling to compensate for less sensitive sensors, which increases computational load, reduces comfort, or limits practicality.",
      "delta_from_canon": "Introduces an ultrasensitive textile strain sensor based on ordered graphene microcracks with 420% gauge factor improvement at ≤5% strain, enabling high information density signals that allow a lightweight residual 1D CNN to achieve high accuracy and reduce computational demands by 90%.",
      "position_in_field": "Advances wearable throat strain sensing SSIs emphasizing comfort, efficiency, and accuracy through sensor-model integration, contrasting prior multi-channel or computationally heavy SSI systems.",
      "practical_value": "Demonstrates a practical low-power wearable SSI system balancing comfort, robustness, efficient inference, and high silent speech decoding accuracy, underscoring a viable path to real-world silent communication.",
      "axes_moved": "modality; system_design; deployment; evaluation",
      "axes_unresolved": "larger vocabulary; sentence-level decoding; broader user populations",
      "axes_regressed": "",
      "technical_limits": "Small participant number (3), limited closed vocabulary size (max 20 words), isolated word classification rather than continuous speech or sentence decoding.",
      "evaluation_limits": "Evaluation limited to isolated word classification on small datasets and three participants; speech speed variation tested only on few long words; transfer learning tested on few samples per class only.",
      "deployment_limits": "Limited to small participant number (3) and small closed vocabularies; lacks continuous speech or sentence-level decoding; transfer learning evaluated only on small samples and selected new words/users.",
      "scope_limits": "Limited to silent word recognition from throat strain signals in small vocabularies; does not address continuous speech decoding or sentence-level understanding.",
      "task": "silent speech word recognition",
      "input_modality": "throat vibration strain",
      "sensor_hardware": "Graphene-coated textile strain sensor with ordered microcracks integrated into a throat choker worn around the neck.",
      "body_site": "throat",
      "output_type": "labels",
      "vocabulary_type": "closed-word classification",
      "vocabulary_size": "20 high-frequency words + 10 confusable word pairs + 5 long words at varying speeds",
      "metrics": "Textile strain sensor gauge factor 317 within 5% strain; reliable over 10,000 stretch cycles; classification accuracy 95.25% (20 words Dataset 1), 93% (10 confusable words Dataset 2), 96% (5 long words Dataset 3); few-shot transfer accuracy 90% with 30 samples per class.",
      "evaluation_mode": "Hardware sensing with textile strain sensor integrated into choker, 500 Hz sampling; end-to-end word classification on three datasets with multiple noise and wearer conditions plus few-shot transfer evaluation.",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 0.95,
          "statement": "We developed a biocompatible strain sensor integrated into a comfortable textile choker capable of enduring over 10,000 stretching-releasing cycles and featuring ordered microcracks of graphene flakes embedded in textile substrates, resulting in unprecedented sensitivity and high information density signals.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_ultrasensitive-textile-strain-sensors-redefine-wearable-silent-speech-interfaces-with-high-machine-learning-efficiency-ultrasensitive-textile-strain-sensors-redefine-wearable-silent-speech-interfaces.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_ultrasensitive-textile-strain-sensors-redefine-wearable-silent-speech-interfaces-with-high-machine-learning-efficiency-ultrasensitive-textile-strain-sensors-redefine-wearable-silent-speech-interfaces.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 0.98,
          "statement": "The sensor's gauge factor exceeds previous state-of-the-art strain sensors by 420%, enabling detection of subtle throat micromovements and producing information-rich signals that simplify decoding and enable lightweight 1D CNN processing.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_ultrasensitive-textile-strain-sensors-redefine-wearable-silent-speech-interfaces-with-high-machine-learning-efficiency-ultrasensitive-textile-strain-sensors-redefine-wearable-silent-speech-interfaces.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_ultrasensitive-textile-strain-sensors-redefine-wearable-silent-speech-interfaces-with-high-machine-learning-efficiency-ultrasensitive-textile-strain-sensors-redefine-wearable-silent-speech-interfaces.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 0.9,
          "statement": "Evaluations were performed on three English datasets collected from three participants including 20 common high-frequency words; 10 confusable word pairs; and 5 long words spoken at varied speeds with 100 samples each (80 train / 20 test). Few-shot transfer to a new user and 10 unseen words was demonstrated using 30 samples per class achieving 90% accuracy.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_ultrasensitive-textile-strain-sensors-redefine-wearable-silent-speech-interfaces-with-high-machine-learning-efficiency-ultrasensitive-textile-strain-sensors-redefine-wearable-silent-speech-interfaces.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_ultrasensitive-textile-strain-sensors-redefine-wearable-silent-speech-interfaces-with-high-machine-learning-efficiency-ultrasensitive-textile-strain-sensors-redefine-wearable-silent-speech-interfaces.txt",
          "section_or_location": "II. Results",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.95,
          "statement": "Classification accuracy is 95.25% on Dataset 1 with 20 words, 93% on Dataset 2 with confusable pairs, 96% on Dataset 3 with long words at varied speech speeds; few-shot transfer learning achieves 90% accuracy with only 30 samples per class. The sensor gauge factor is 317 within 5% strain with stable linear resistance response and durability over 10,000 stretching cycles.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_ultrasensitive-textile-strain-sensors-redefine-wearable-silent-speech-interfaces-with-high-machine-learning-efficiency-ultrasensitive-textile-strain-sensors-redefine-wearable-silent-speech-interfaces.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_ultrasensitive-textile-strain-sensors-redefine-wearable-silent-speech-interfaces-with-high-machine-learning-efficiency-ultrasensitive-textile-strain-sensors-redefine-wearable-silent-speech-interfaces.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 0.9,
          "statement": "The study features only three participants, evaluates closed vocabularies (max 20 words), and focuses on isolated word classification rather than continuous speech or sentence decoding; transfer learning evaluated on limited samples and classes; real-world deployment would require broader vocabularies and natural language tasks.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_ultrasensitive-textile-strain-sensors-redefine-wearable-silent-speech-interfaces-with-high-machine-learning-efficiency-ultrasensitive-textile-strain-sensors-redefine-wearable-silent-speech-interfaces.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_ultrasensitive-textile-strain-sensors-redefine-wearable-silent-speech-interfaces-with-high-machine-learning-efficiency-ultrasensitive-textile-strain-sensors-redefine-wearable-silent-speech-interfaces.txt",
          "section_or_location": "II. Results",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "deployment_claim",
          "confidence": 0.9,
          "statement": "The wearable silent speech interface enables hands-free, real-time, speaker-dependent recognition using a choker-mounted textile strain sensor and an efficient 1D CNN, achieving both comfort and computational efficiency suitable for mobile applications.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_ultrasensitive-textile-strain-sensors-redefine-wearable-silent-speech-interfaces-with-high-machine-learning-efficiency-ultrasensitive-textile-strain-sensors-redefine-wearable-silent-speech-interfaces.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_ultrasensitive-textile-strain-sensors-redefine-wearable-silent-speech-interfaces-with-high-machine-learning-efficiency-ultrasensitive-textile-strain-sensors-redefine-wearable-silent-speech-interfaces.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_distributed-pressure-matching-strategy-using-diffusion-adaptation",
      "slug": "distributed-pressure-matching-strategy-using-diffusion-adaptation",
      "title": "Distributed pressure matching strategy using diffusion adaptation",
      "year": 2023,
      "venue": "arXiv / imported corpus page",
      "authors": [],
      "url": "https://nao-ki-mura.com/paper/distributed-pressure-matching-strategy-using-diffusion-adaptation",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2311.07729",
      "arxiv_url": "https://arxiv.org/abs/2311.07729",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "modality:acoustic",
        "modality:microphone",
        "output:audio",
        "evaluation:quantitative"
      ],
      "expert_take_short": "Presents a distributed pressure matching method for personal sound zones that needs no root node, validated in simulation only; this is not an SSI paper.",
      "expert_take_long": "This paper presents a technical contribution in the domain of personal sound zones by reformulating the pressure matching method under a diffusion LMS distributed adaptation framework. The key contribution is distributing pressure matching control computation across multiple acoustic nodes without relying on a single root node, a limitation of previous distributed ACC approaches. Through extensive simulations using a rectangular room with simulated RIRs and noise-perturbed ATFs, the algorithm shows steady-state performance comparable to centralized pressure matching, achieving NMSE around -16 dB and acoustic contrast around 16 dB on control points after 5000 iterations. The algorithm thus offers a plausible way to scale sound zone control in large networks by distributing computation and communication load. However, the scope is clearly room acoustic control and does not address speech or articulation sensing, so it lies outside the silent speech interfaces domain. Deployment readiness is limited by lack of real environment tests, and practical challenges such as synchronization and online ATF measurement remain open. Still, this paper represents a meaningful advance for distributed sound field control in the acoustic domain.",
      "expert_true_value": "A credible acoustics systems paper introducing diffusion LMS for distributed personal sound zone control, achieving comparable performance to centralized PM without a root node dependency.",
      "canon_before": "Personal sound-zone control normally relies on centralized pressure matching or distributed acoustic contrast control (ACC) variants that require a root node for global coordination.",
      "delta_from_canon": "Recast pressure matching as a sum of local costs for each node and apply diffusion LMS adaptation allowing each node to compute locally and share only with neighbors, eliminating the root node bottleneck.",
      "position_in_field": "Outside SSI; relevant as adjacent acoustic system work",
      "practical_value": "Useful for scalable personal audio control scenarios requiring distributed computation and communication across loudspeaker and microphone nodes.",
      "axes_moved": "distributed_control; scalability; computational_load_distribution",
      "axes_unresolved": "real_room_robustness; synchronization; time_varying_ATFs",
      "axes_regressed": "",
      "technical_limits": "Purely simulation results; computational cost grows with number of microphones per node; no robustness evaluation for practical ATF time variations or synchronization errors.",
      "evaluation_limits": "Evaluation is purely simulation-based with modeled room, no real acoustic measurements or user studies.",
      "deployment_limits": "No real-room implementations; requires synchronization, calibration, and stable ATF measurements in practical deployments.",
      "scope_limits": "Focus fully on room acoustic personal sound zones; no speech input, articulation sensing, or silent speech tasks included.",
      "task": "sound-zone control",
      "input_modality": "acoustic",
      "sensor_hardware": "Distributed acoustic nodes each equipped with multiple microphones and loudspeakers",
      "body_site": "",
      "output_type": "audio",
      "vocabulary_type": "",
      "vocabulary_size": "",
      "metrics": "Normalized Mean Square Error (NMSE) and Acoustic Contrast (AC) at control and validation points; single-frequency and multi-frequency tests showing steady-state NMSE ~ -16 dB and AC ~ 16 dB on control points after 5000 iterations.",
      "evaluation_mode": "Monte Carlo simulations over 100 runs using synthetic RIRs and random noise perturbations; multi-frequency test from 100 to 4000 Hz; comparison against centralized pressure matching baseline.",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 0.95,
          "statement": "This paper presents a distributed pressure-matching (PM) method relying on diffusion adaptation (DPM-D) to spread the computational load amongst nodes in order to overcome centralized method limitations.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_distributed-pressure-matching-strategy-using-diffusion-adaptation-distributed-pressure-matching-strategy-using-diffusion-adaptation.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_distributed-pressure-matching-strategy-using-diffusion-adaptation-distributed-pressure-matching-strategy-using-diffusion-adaptation.txt",
          "section_or_location": "ABSTRACT",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 0.95,
          "statement": "The diffusion adaptation approach creates a distributed solution requiring only local exchanges, eliminating the need for a root node seen in prior distributed ACC algorithms.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_distributed-pressure-matching-strategy-using-diffusion-adaptation-distributed-pressure-matching-strategy-using-diffusion-adaptation.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_distributed-pressure-matching-strategy-using-diffusion-adaptation-distributed-pressure-matching-strategy-using-diffusion-adaptation.txt",
          "section_or_location": "1. INTRODUCTION",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.9,
          "statement": "At steady state on control points, the NMSE and AC are approximately -16 dB and 16 dB respectively; on validation points NMSE and AC are about -14 dB and 14 dB; performances comparable to centralized PM after 5000 iterations.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_distributed-pressure-matching-strategy-using-diffusion-adaptation-distributed-pressure-matching-strategy-using-diffusion-adaptation.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_distributed-pressure-matching-strategy-using-diffusion-adaptation-distributed-pressure-matching-strategy-using-diffusion-adaptation.txt",
          "section_or_location": "4. SIMULATIONS",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 0.9,
          "statement": "Evaluations are performed using 100 Monte Carlo runs over a simulated rectangular room of size 8.088m x 7.346m x 2.865m with T60 ≈ 200 ms; using 9 loudspeakers, 16 microphones arranged in 2 distributed system topologies; perturbations added to ATFs; signals are multi-frequency bins from 100 to 4000 Hz.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_distributed-pressure-matching-strategy-using-diffusion-adaptation-distributed-pressure-matching-strategy-using-diffusion-adaptation.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_distributed-pressure-matching-strategy-using-diffusion-adaptation-distributed-pressure-matching-strategy-using-diffusion-adaptation.txt",
          "section_or_location": "4. SIMULATIONS",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 0.85,
          "statement": "Evidence is simulation only with no real-room deployment or user studies; practical deployment would need synchronization, calibration, and robust ATF measurement.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_distributed-pressure-matching-strategy-using-diffusion-adaptation-distributed-pressure-matching-strategy-using-diffusion-adaptation.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_distributed-pressure-matching-strategy-using-diffusion-adaptation-distributed-pressure-matching-strategy-using-diffusion-adaptation.txt",
          "section_or_location": "5. CONCLUSION",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "deployment_claim",
          "confidence": 0.85,
          "statement": "The proposed method surpasses limitations of prior distributed ACC approaches which required a root node, enabling each node to independently estimate and share information with neighbors, enhancing stability and removing root node failure risk.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_distributed-pressure-matching-strategy-using-diffusion-adaptation-distributed-pressure-matching-strategy-using-diffusion-adaptation.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_distributed-pressure-matching-strategy-using-diffusion-adaptation-distributed-pressure-matching-strategy-using-diffusion-adaptation.txt",
          "section_or_location": "1. INTRODUCTION",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_advancing-test-time-adaptation-for-acoustic-foundation-models-in-open-world-shifts",
      "slug": "advancing-test-time-adaptation-for-acoustic-foundation-models-in-open-world-shifts",
      "title": "Advancing Test-Time Adaptation for Acoustic Foundation Models in Open-World Shifts",
      "year": 2023,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Hongfu Liu",
        "Hengguan Huang",
        "Ye Wang"
      ],
      "url": "https://nao-ki-mura.com/paper/advancing-test-time-adaptation-for-acoustic-foundation-models-in-open-world-shifts",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2310.09505",
      "arxiv_url": "https://arxiv.org/abs/2310.09505",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "evaluation:quantitative",
        "modality:acoustic",
        "output:text",
        "task:speech-recognition"
      ],
      "expert_take_short": "Strong acoustic ASR paper proposing confidence-weighted frame adaptation plus temporal consistency regularization for stable test-time adaptation under wild acoustic conditions, yielding substantial WER improvements across noise, accents, and singing datasets.",
      "expert_take_long": "This paper presents a significant advancement in test-time adaptation for acoustic foundation models under complex wild acoustic shifts. It identifies that many high-entropy frames within non-silent speech segments, previously considered unreliable and discarded, contain valuable semantic content. The authors propose Confidence-Enhanced Adaptation that weights these noisy frames by their entropy-based confidence to adapt feature extractor parameters effectively, coupled with short-term consistency regularization exploiting speech temporal coherence. Experiments reveal that on Gaussian noise corrupted LibriSpeech test-other, their method achieves an average relative WER improvement of 21.5% over the unadapted model and even 41.7% relative improvement at 5 dB SNR Air Conditioner noise. For real-world shifts like L2 accents and singing voice, their method consistently outperforms baselines including Tent, SAR, TeCo, and SUTA across Wav2vec2 Base and Large models. Ablation studies confirm the confidence weighting as the core contribution with additional benefit from temporal regularization. Generalization is demonstrated on Conformer CTC and Transducer models. The approach is near real-time with adaptation latency around 1.07 seconds but remains offline episodic; streaming adaptation and decoder or language model text adaptation remain open challenges. The work is outside traditional SSI scope as it focuses strictly on acoustic speech ASR robustness and adaptation rather than silent speech decoding or broader speech tasks. Overall, it provides an important and well-validated method for enhancing online ASR robustness under unpredictable wild acoustic conditions.",
      "expert_true_value": "Demonstrates that noisy yet semantically critical high-entropy frames in non-silent speech should be leveraged with confidence-aware weighting during online adaptation rather than discarded, substantially improving robustness of acoustic foundation models to diverse real-world shifts.",
      "canon_before": "Previous ASR robustness work focused on handling individual corruptions, relying on discarding noisy frames or static vision TTA adaptations that treat samples as independent. Open-world acoustic shifts with high-entropy frames and temporal speech coherence remained unaddressed for stable TTA.",
      "delta_from_canon": "Treats high-entropy noisy frames within non-silent speech as valuable adaptation targets weighted by confidence rather than filtering them out, combined with temporal consistency regularization for stable frame-level adaptation.",
      "position_in_field": "Strong recent open-world test-time adaptation study for acoustic foundation models in speech recognition under diverse real-world acoustic shifts.",
      "practical_value": "Provides a practically useful test-time adaptation approach to enhance robustness of deployed ASR systems against environmental noises, accents, and singing without retraining or source data access.",
      "axes_moved": "system_design; evaluation",
      "axes_unresolved": "text-domain adaptation; broader task transfer beyond ASR; true streaming deployment",
      "axes_regressed": "",
      "technical_limits": "Decoder and language model side text-domain adaptation remain unaddressed; no exploration of multi-speaker or cross-task generalization; adaptation latency of about one second restricts immediate streaming deployment.",
      "evaluation_limits": "Evaluated on established ASR fine-tuned acoustic models on synthetic noise, environmental sounds, accents, and singing voice datasets. Leaves decoder side and language model text-domain adaptation as open problems. Real streaming scenarios not tested.",
      "deployment_limits": "Designed for offline episodic utterance-level TTA with ~1.07s adaptation latency and 1.20s recognition runtime on A5000 GPU. Does not address streaming deployment, decoder text-domain adaptation, or broader task transfer beyond ASR.",
      "scope_limits": "Acoustic speech recognition adaptation under wild acoustic shifts including noise, environmental sounds, accents, and singing voice; does not cover silent speech or broader speech understanding tasks.",
      "task": "speech-recognition",
      "input_modality": "acoustic",
      "sensor_hardware": "microphone",
      "body_site": "",
      "output_type": "text",
      "vocabulary_type": "open ASR vocabularies",
      "vocabulary_size": "",
      "metrics": "Achieves 21.5% average relative WER improvement over source model on Gaussian noise corruption (LibriSpeech LS-C); 41.7% relative improvement at 5 dB SNR Air Conditioner noise; 1.07 s adaptation latency and 1.20 s recognition runtime on A5000 GPU; consistently lower WER than baselines on L2 accents and singing voice datasets.",
      "evaluation_mode": "Multi-dataset word error rate (WER) benchmarking with ablation studies, cross-model generalization (Conformer, Transducer), latency measurements, and comparison to Whisper ASR model.",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 1,
          "statement": "Our method, Confidence-Enhanced Adaptation, performs frame-level adaptation using a confidence-aware weight scheme to avoid filtering out essential information in high-entropy frames, combined with consistency regularization to leverage short-term speech consistency.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_advancing-test-time-adaptation-for-acoustic-foundation-models-in-open-world-shifts-advancing-test-time-adaptation-for-acoustic-foundation-models-in-open-world-shif.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_advancing-test-time-adaptation-for-acoustic-foundation-models-in-open-world-shifts-advancing-test-time-adaptation-for-acoustic-foundation-models-in-open-world-shif.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 1,
          "statement": "Confidence-Enhanced Adaptation reweights noisy non-silent frames via entropy-based confidence rather than discarding them, and applies short-term consistency regularization on frame representations for stable test-time adaptation of acoustic foundation models under wild acoustic shifts.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_advancing-test-time-adaptation-for-acoustic-foundation-models-in-open-world-shifts-advancing-test-time-adaptation-for-acoustic-foundation-models-in-open-world-shif.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_advancing-test-time-adaptation-for-acoustic-foundation-models-in-open-world-shifts-advancing-test-time-adaptation-for-acoustic-foundation-models-in-open-world-shif.txt",
          "section_or_location": "4 Method",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "On the LibriSpeech test-other corruptions with Gaussian noise, the method achieves an average relative WER improvement of 21.5% over the unadapted source model; on Air Conditioner sound at 5 dB SNR, relative improvement reaches 41.7%; adaptation latency is about 1.07 seconds plus 1.20 seconds recognition runtime on A5000 GPU.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_advancing-test-time-adaptation-for-acoustic-foundation-models-in-open-world-shifts-advancing-test-time-adaptation-for-acoustic-foundation-models-in-open-world-shif.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_advancing-test-time-adaptation-for-acoustic-foundation-models-in-open-world-shifts-advancing-test-time-adaptation-for-acoustic-foundation-models-in-open-world-shif.txt",
          "section_or_location": "5 Experiments",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 1,
          "statement": "Validated on synthetic and real datasets including LibriSpeech corrupted by Gaussian noise and environmental sounds, L2-Arctic accented speech, and singing voice datasets DSing and Hansen, with multiple architectures (Wav2vec2 Base and Large, Conformer, Transducer) and decoding strategies (greedy and beam search).",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_advancing-test-time-adaptation-for-acoustic-foundation-models-in-open-world-shifts-advancing-test-time-adaptation-for-acoustic-foundation-models-in-open-world-shif.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_advancing-test-time-adaptation-for-acoustic-foundation-models-in-open-world-shifts-advancing-test-time-adaptation-for-acoustic-foundation-models-in-open-world-shif.txt",
          "section_or_location": "5 Experiments",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 1,
          "statement": "Decoder text-domain adaptation and broader task transfer beyond ASR remain open challenges. The method is not designed for streaming deployment with real-time constraints; adaptation latency around 1.07 seconds makes immediate streaming application limited.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_advancing-test-time-adaptation-for-acoustic-foundation-models-in-open-world-shifts-advancing-test-time-adaptation-for-acoustic-foundation-models-in-open-world-shif.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_advancing-test-time-adaptation-for-acoustic-foundation-models-in-open-world-shifts-advancing-test-time-adaptation-for-acoustic-foundation-models-in-open-world-shif.txt",
          "section_or_location": "6 Analysis",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_sound-source-localization-is-all-about-cross-modal-alignment",
      "slug": "sound-source-localization-is-all-about-cross-modal-alignment",
      "title": "Sound Source Localization is All about Cross-Modal Alignment",
      "year": 2023,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Arda Senocak",
        "Hyeonggon Ryu",
        "Junsik Kim",
        "Tae-Hyun Oh",
        "Hanspeter Pfister",
        "Joon Son Chung"
      ],
      "url": "https://nao-ki-mura.com/paper/sound-source-localization-is-all-about-cross-modal-alignment",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2309.10724",
      "arxiv_url": "https://arxiv.org/abs/2309.10724",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "modality:multimodal",
        "output:labels",
        "evaluation:quantitative"
      ],
      "expert_take_short": "Provides a novel multi-positive contrastive framework enhancing semantic audio-visual alignment for sound source localization. Strong experimental evidence supports claims. Method is outside the SSI domain.",
      "expert_take_long": "This work critically examines the limitations of current sound source localization, which predominantly focuses on spatial alignment without ensuring that models truly understand semantic correspondences between audio and visual modalities. The authors propose a novel contrastive learning framework that constructs multiple positive pairs via multi-view augmentations and semantic nearest neighbor mining using pretrained encoders. This enables the model to jointly optimize sound localization and semantic alignment. Extensive experiments on popular benchmarks like VGGSound, SoundNet-Flickr, and AVSBench demonstrate consistent improvements in both localization accuracy and cross-modal retrieval. Importantly, the paper reveals that high localization performance alone does not guarantee semantic understanding, motivating the joint learning and evaluation approach. However, the work is orthogonal to silent speech interface (SSI) research, as it neither targets speech-specific modalities nor addresses speech-related tasks. The approach relies heavily on curated datasets and pretrained models for sample mining, limiting direct real-world deployment and generalization. Overall, this paper reframes sound source localization evaluation and training towards semantic audio-visual grounding, representing a valuable advancement in multimodal representation learning but with limited immediate impact on SSI.",
      "expert_true_value": "Clarifies that spatial localization metrics are insufficient proxies for genuine audio-visual grounding and introduces a training and evaluation protocol that ensures semantic alignment, improving localization quality and robustness.",
      "canon_before": "Prior sound source localization benchmarks emphasized spatial localization accuracy without ensuring genuine audio-visual semantic grounding, often relying on instance discrimination with limited positive pairs.",
      "delta_from_canon": "Reframes sound source localization as a joint task with cross-modal semantic alignment, employing multi-view and conceptually similar positive sets for contrastive learning.",
      "position_in_field": "Audio-visual representation learning and localization, distinct from SSI core literature.",
      "practical_value": "Valuable for multimodal audio-visual grounding systems that require joint semantic and spatial alignment; lacks direct applicability to speech interfaces or silent speech tasks.",
      "axes_moved": "evaluation; problem_reframing",
      "axes_unresolved": "Transfer to broader multimodal perception tasks; SSI relevance",
      "axes_regressed": "",
      "technical_limits": "Dependent on curated benchmarks and pretrained encoders for nearest neighbor positive mining; no real-world deployment or speech-related evaluation; relies on batch contrastive methods and choices in k nearest neighbors.",
      "evaluation_limits": "Evaluations are conducted on curated audio-visual datasets with bounding box or segmentation annotations; no evaluation on spontaneous human speech or silent speech tasks; open-set and false positive detection benchmarks are included but limited in real-world variability.",
      "deployment_limits": "No deployment pathway described; the approach produces localization and retrieval labels but is not integrated for real-time or speech interface use; no user-centric evaluation.",
      "scope_limits": "Focused solely on audio-visual source localization and semantic alignment; no treatment of speech or silent speech interfaces.",
      "task": "sound source localization; cross-modal retrieval",
      "input_modality": "audio + video",
      "sensor_hardware": "camera + microphone",
      "body_site": "",
      "output_type": "labels",
      "vocabulary_type": "",
      "vocabulary_size": "",
      "metrics": "Localization measured by corrected Intersection over Union (cIoU) and Area Under Curve (AUC); retrieval evaluated by recall rates at top ranks (R@1, R@5, R@10); false positives assessed by Average Precision (AP) and maximum F1 scores on extended datasets.",
      "evaluation_mode": "Quantitative evaluation on localization and cross-modal retrieval tasks including ablation studies, open-set category tests, and false positive detection on extended benchmarks.",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 0.95,
          "statement": "We propose a cross-modal alignment task as a joint task with sound source localization to better learn the interaction between audio and visual modalities, achieving strong localization performance and cross-modal semantic understanding.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_sound-source-localization-is-all-about-cross-modal-alignment-sound-source-localization-is-all-about-cross-modal-alignment.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_sound-source-localization-is-all-about-cross-modal-alignment-sound-source-localization-is-all-about-cross-modal-alignment.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "fact",
          "confidence": 0.95,
          "statement": "The method constructs multiple positive audio-visual pairs via augmentation and conceptually similar samples mined by pretrained encoders to enhance semantic alignment beyond single-instance discrimination typical of existing approaches.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_sound-source-localization-is-all-about-cross-modal-alignment-sound-source-localization-is-all-about-cross-modal-alignment.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_sound-source-localization-is-all-about-cross-modal-alignment-sound-source-localization-is-all-about-cross-modal-alignment.txt",
          "section_or_location": "3.3 Expanding with Multiple Positive Samples",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "fact",
          "confidence": 0.95,
          "statement": "Evaluation is performed on large-scale datasets including VGGSound, SoundNet-Flickr, AVSBench S4, and their extended variants with bounding box or segmentation annotations; tasks include sound source localization and cross-modal retrieval with additional false positive detection benchmarks.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_sound-source-localization-is-all-about-cross-modal-alignment-sound-source-localization-is-all-about-cross-modal-alignment.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_sound-source-localization-is-all-about-cross-modal-alignment-sound-source-localization-is-all-about-cross-modal-alignment.txt",
          "section_or_location": "4. Experiments",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 0.95,
          "statement": "The paper reframes sound source localization from spatial alignment only to a joint task including semantic alignment, employing multi-positive contrastive learning and auxiliary cross-modal retrieval evaluation to capture semantic grounding not addressed by prior benchmarks.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_sound-source-localization-is-all-about-cross-modal-alignment-sound-source-localization-is-all-about-cross-modal-alignment.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_sound-source-localization-is-all-about-cross-modal-alignment-sound-source-localization-is-all-about-cross-modal-alignment.txt",
          "section_or_location": "1. Introduction",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 0.95,
          "statement": "The approach depends heavily on curated datasets with limited real-world variability, pretrained encoders for mining conceptually similar samples, and lacks real time, user-centered deployment or SSI specific evaluations or applications.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_sound-source-localization-is-all-about-cross-modal-alignment-sound-source-localization-is-all-about-cross-modal-alignment.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_sound-source-localization-is-all-about-cross-modal-alignment-sound-source-localization-is-all-about-cross-modal-alignment.txt",
          "section_or_location": "5. Conclusion",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_let-there-be-sound-reconstructing-high-quality-speech-from-silent-videos",
      "slug": "let-there-be-sound-reconstructing-high-quality-speech-from-silent-videos",
      "title": "Let There Be Sound: Reconstructing High Quality Speech from Silent Videos",
      "year": 2023,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Ji-Hoon Kim",
        "Jaehun Kim",
        "Joon Son Chung"
      ],
      "url": "https://nao-ki-mura.com/paper/let-there-be-sound-reconstructing-high-quality-speech-from-silent-videos",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2308.15256",
      "arxiv_url": "https://arxiv.org/abs/2308.15256",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:lip",
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "modality:video",
        "output:speech-audio",
        "task:speech-reconstruction"
      ],
      "expert_take_short": "Strong lip-to-speech system that reduces ambiguity via SSL linguistic conditioning, variance predictors, and flow-based refinement, achieving near-vocoded naturalness and improved intelligibility on standard datasets.",
      "expert_take_long": "This work proposes a high-quality lip-to-speech reconstruction system using only lip video inputs. It innovatively addresses the intrinsic one-to-many mapping challenge caused by homophenes and speech variability. The authors leverage intermediate-layer HuBERT self-supervised representations as linguistic predictor targets, explicitly model pitch and energy variance to capture prosodic richness, and incorporate a flow-based post-net to refine mel-spectrogram outputs that otherwise tend to be over-smoothed. Comprehensive experiments on the constrained GRID dataset and the larger Lip2Wav Chemistry and Chess datasets demonstrate state-of-the-art results in both perceptual naturalness (MOS only 0.28 below vocoded speech on GRID) and intelligibility (WER/CER improvements compared to prior methods). Ablation studies confirm the critical contributions of each component, especially the self-supervised linguistic predictor and flow post-net. However, the method remains a multi-stage research system without real-time capabilities or end-to-end simplicity, limiting immediate deployment readiness. Evaluation is limited to public datasets without open-environment testing. Overall, the paper significantly advances lip-to-speech quality as a reconstruction problem focused on ambiguity reduction, but leaves challenges of generalization, inference efficiency, and deployment for future work.",
      "expert_true_value": "It contributes a novel pipeline that disentangles linguistic content and acoustic variation from silent lip video using self-supervised representations and variance modeling, refined by flow-based post-processing, achieving state-of-the-art high-quality lip-to-speech reconstruction.",
      "canon_before": "Lip-to-speech quality was limited by homophenes, over-smoothed outputs, and weak prosodic control.",
      "delta_from_canon": "Employs SSL linguistic conditioning, explicit pitch and energy variance predictors, and a flow-based post-net refinement, moving beyond simple mel-spectrogram regression and inadequately modeled one-to-many ambiguities.",
      "position_in_field": "Notable advance in lip-to-speech synthesis enhancing ambiguity resolution and speech quality with SSL and flow-based methods, representing leading-edge progress in video-to-speech.",
      "practical_value": "Provides a strong lip-to-speech baseline with detailed component ablations and quantitative benchmarking useful for research and development of high-quality silent video speech synthesis systems.",
      "axes_moved": "ambiguity_reduction; speech_quality; prosody_modeling",
      "axes_unresolved": "real-time inference; broader generalization; simpler end-to-end deployment",
      "axes_regressed": "",
      "technical_limits": "Multi-stage pipeline; no latency or streaming evaluation; complex architecture not end-to-end; needs neural vocoder at inference.",
      "evaluation_limits": "Evaluated mainly on GRID and Lip2Wav subsets, which are relatively constrained and do not cover open-world, real-time, or cross-domain deployment scenarios.",
      "deployment_limits": "No real-time or on-device deployment targets; system is a multi-stage research pipeline without simplified inference or low-latency optimization.",
      "scope_limits": "Lip-to-speech reconstruction from silent video only; no multi-modal or audio-assisted input considered.",
      "task": "speech-reconstruction",
      "input_modality": "video",
      "sensor_hardware": "camera",
      "body_site": "lip",
      "output_type": "speech-audio",
      "vocabulary_type": "phonetic",
      "vocabulary_size": "GRID: 51; Lip2Wav: large vocabulary",
      "metrics": "MOS naturalness gap 0.28 and intelligibility gap 0.16 vs vocoded speech on GRID; WER 17.07%, CER 9.17% on GRID; WER and CER improvements over prior SOTA on Lip2Wav; pitch distribution moments and energy MAE metrics validate variance predictors.",
      "evaluation_mode": "MOS for naturalness and intelligibility, WER/CER via ASR transcription comparison, pitch-energy statistical analysis, and ablations.",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 1,
          "statement": "We propose a novel lip-to-speech system that alleviates the one-to-many mapping problem by incorporating self-supervised speech representations to disambiguate homophenes, modeling acoustic variance for speech variation, and employing a flow-based post-net to refine synthesized speech details.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_let-there-be-sound-reconstructing-high-quality-speech-from-silent-videos-let-there-be-sound-reconstructing-high-quality-speech-from-silent-videos.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_let-there-be-sound-reconstructing-high-quality-speech-from-silent-videos-let-there-be-sound-reconstructing-high-quality-speech-from-silent-videos.txt",
          "section_or_location": "Introduction",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 1,
          "statement": "Our method combines self-supervised linguistic targets from intermediate HuBERT layers, explicit variance predictors for pitch and energy, and a flow-based post-net to enhance speech reconstruction quality, addressing one-to-many ambiguity more explicitly than previous work.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_let-there-be-sound-reconstructing-high-quality-speech-from-silent-videos-let-there-be-sound-reconstructing-high-quality-speech-from-silent-videos.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_let-there-be-sound-reconstructing-high-quality-speech-from-silent-videos-let-there-be-sound-reconstructing-high-quality-speech-from-silent-videos.txt",
          "section_or_location": "Method",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "On the GRID dataset, our model achieves a MOS gap from vocoded speech of only 0.28 in naturalness and 0.13 in intelligibility; WER and CER are 17.07% and 9.17% respectively, outperforming previous lip-to-speech methods.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_let-there-be-sound-reconstructing-high-quality-speech-from-silent-videos-let-there-be-sound-reconstructing-high-quality-speech-from-silent-videos.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_let-there-be-sound-reconstructing-high-quality-speech-from-silent-videos-let-there-be-sound-reconstructing-high-quality-speech-from-silent-videos.txt",
          "section_or_location": "Quantitative Results",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 1,
          "statement": "We evaluate on the constrained GRID dataset and larger Lip2Wav Chemistry and Chess multi-speaker subsets; evaluation includes MOS naturalness/intelligibility, WER/CER via ASR, and pitch-energy statistics.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_let-there-be-sound-reconstructing-high-quality-speech-from-silent-videos-let-there-be-sound-reconstructing-high-quality-speech-from-silent-videos.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_let-there-be-sound-reconstructing-high-quality-speech-from-silent-videos-let-there-be-sound-reconstructing-high-quality-speech-from-silent-videos.txt",
          "section_or_location": "Experimental Setting",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 1,
          "statement": "The system is a multi-stage research pipeline without real-time or on-device deployment claims; end-to-end simplification and inference latency remain for future work.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_let-there-be-sound-reconstructing-high-quality-speech-from-silent-videos-let-there-be-sound-reconstructing-high-quality-speech-from-silent-videos.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_let-there-be-sound-reconstructing-high-quality-speech-from-silent-videos-let-there-be-sound-reconstructing-high-quality-speech-from-silent-videos.txt",
          "section_or_location": "Conclusion",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 1,
          "statement": "Evaluations are limited to GRID and Lip2Wav datasets which do not cover open-world or real-time deployment, with manual transcription used in Lip2Wav due to lack of text labels.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_let-there-be-sound-reconstructing-high-quality-speech-from-silent-videos-let-there-be-sound-reconstructing-high-quality-speech-from-silent-videos.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_let-there-be-sound-reconstructing-high-quality-speech-from-silent-videos-let-there-be-sound-reconstructing-high-quality-speech-from-silent-videos.txt",
          "section_or_location": "Evaluation Metrics",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "fact",
          "confidence": 1,
          "statement": "Each component independently contributes to quality: the linguistic predictor disambiguates homophenes, improving intelligibility and reducing recognition errors; the pitch and energy variance predictors improve prosodic modeling; the flow post-net improves naturalness and detail.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_let-there-be-sound-reconstructing-high-quality-speech-from-silent-videos-let-there-be-sound-reconstructing-high-quality-speech-from-silent-videos.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_let-there-be-sound-reconstructing-high-quality-speech-from-silent-videos-let-there-be-sound-reconstructing-high-quality-speech-from-silent-videos.txt",
          "section_or_location": "Ablation Study",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "fact",
          "confidence": 1,
          "statement": "Using the 12th layer of HuBERT-large with 200 K-means clusters for quantized linguistic representation yields the best intelligibility (lowest phoneme error rate) and contributes strongly to pronunciation accuracy in the model.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_let-there-be-sound-reconstructing-high-quality-speech-from-silent-videos-let-there-be-sound-reconstructing-high-quality-speech-from-silent-videos.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_let-there-be-sound-reconstructing-high-quality-speech-from-silent-videos-let-there-be-sound-reconstructing-high-quality-speech-from-silent-videos.txt",
          "section_or_location": "Analysis on Self",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_an-initial-exploration-learning-to-generate-realistic-audio-for-silent-video",
      "slug": "an-initial-exploration-learning-to-generate-realistic-audio-for-silent-video",
      "title": "An Initial Exploration: Learning to Generate Realistic Audio for Silent Video",
      "year": 2023,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Matthew Martel",
        "Jackson Wagner"
      ],
      "url": "https://nao-ki-mura.com/paper/an-initial-exploration-learning-to-generate-realistic-audio-for-silent-video",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2308.12408",
      "arxiv_url": "https://arxiv.org/abs/2308.12408",
      "review_state": "expert_fulltext_draft",
      "review_priority": "low",
      "review_confidence": "medium-high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "evaluation:quantitative",
        "modality:video",
        "output:audio"
      ],
      "expert_take_short": "Honest exploratory comparison showing transformer-based model outperforms deep-fusion CNN and Wavenet for generating low-to-mid frequency audio from silent video in a small curated dataset; not a speech or SSI paper.",
      "expert_take_long": "This exploratory study systematically compares three state-of-the-art audio generation architectures conditioned on silent video: a deep-fusion CNN, a dilated Wavenet CNN with video embeddings, and an audio-video transformer. The transformer approach yields the best validation cross-entropy losses and qualitatively captures low and mid-frequency audio correlated to video events (e.g., car engines, clapping). Deep-fusion faces substantial boundary discontinuities and dominant unwanted frequencies, while Wavenet outputs resemble white noise or near silence. The study highlights the challenge of video-to-audio generation given the sparse visual cues and inherent ambiguities, and the limited dataset confines the models to narrow domain overfitting. The work's core contribution lies in benchmarking these architectures, illustrating that the transformer is the most promising starting point for further development. However, realism and fidelity remain limited, and the approach is far from deployable SSI speech or general video-to-audio systems. Future work requires larger, more varied data and extended training regimes to leverage the transformer architecture's full capacity.",
      "expert_true_value": "Offers a comparative negative/positive benchmark confirming transformer conditioning is relatively better for silent video audio synthesis, highlighting architectural pitfalls and setting directions for improvements rather than delivering a production system.",
      "canon_before": "Prior work included video-conditioned sound generation using SampleRNN and various encoder architectures, with limited architectural comparisons of deep-fusion CNN, Wavenet, and transformers for this task.",
      "delta_from_canon": "Frames contribution as a head-to-head architecture comparison study, introducing and validating transformer architecture for video-conditioned audio generation, rather than delivering a ready audio generation system.",
      "position_in_field": "Exploratory video-to-audio generation architecture comparison in the multimedia domain, outside traditional speech SSI tasks.",
      "practical_value": "Provides useful architecture baseline comparisons for future video-conditioned audio generation research, cautioning on limitations and common failure modes.",
      "axes_moved": "system_design; evaluation",
      "axes_unresolved": "dataset scale; nuanced waveform fidelity; generalization beyond three video types",
      "axes_regressed": "",
      "technical_limits": "Small, single-video training datasets, limited model capacity due to resource constraints, qualitative over quantitative evaluation, limited frequency fidelity, and overfitting.",
      "evaluation_limits": "Limited to three type-specific videos, trained on individual videos leading to overfitting, mostly qualitative assessment with validation cross-entropy as quantitative measure; no unseen diverse dataset evaluation.",
      "deployment_limits": "The dataset is small and limited in diversity, models are overfitted to single video types, qualitative evaluation predominates, and models fail on nuanced or high-frequency sounds, limiting real-world deployment potential.",
      "scope_limits": "Limited to generating Foley-like sound effects from small curated video sets; does not tackle speech or complex audio synthesis; exploratory preliminary study.",
      "task": "audio-generation-from-silent-video",
      "input_modality": "video",
      "sensor_hardware": "camera",
      "body_site": "",
      "output_type": "audio",
      "vocabulary_type": "",
      "vocabulary_size": "",
      "metrics": "Validation cross-entropy losses per video and model (Table I): Transformer (car chase -0.22000, clapping -0.00797, nature -0.00862) outperforms Wavenet (car chase -0.03785, clapping 0.00029, nature 0.01669) and deep-fusion CNN (car chase 1.65133e-05, clapping -1.36272e-07, nature 1.04321e-05).",
      "evaluation_mode": "Validation cross-entropy loss comparison complemented by qualitative waveform and perceptual assessments on held-out video segments.",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 1,
          "statement": "The paper aims to develop a deep learning framework that observes silent video sequences and generates realistic audio effects, exploring deep-fusion CNN, dilated Wavenet CNN, and transformer-based architectures, finding that the transformer yields the most promising results.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_an-initial-exploration-learning-to-generate-realistic-audio-for-silent-video-an-initial-exploration-learning-to-generate-realistic-audio-for-silent-video.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_an-initial-exploration-learning-to-generate-realistic-audio-for-silent-video-an-initial-exploration-learning-to-generate-realistic-audio-for-silent-video.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "Validation cross-entropy loss Table I shows transformer architecture achieves -0.22000 on car chase, -0.00797 on clapping, and -0.00862 on nature videos, outperforming Wavenet and deep-fusion CNN.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_an-initial-exploration-learning-to-generate-realistic-audio-for-silent-video-an-initial-exploration-learning-to-generate-realistic-audio-for-silent-video.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_an-initial-exploration-learning-to-generate-realistic-audio-for-silent-video-an-initial-exploration-learning-to-generate-realistic-audio-for-silent-video.txt",
          "section_or_location": "IV. EXPERIMENTS",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 1,
          "statement": "The dataset is small and limited to a few curated video types from YouTube and homemade videos; models are trained on individual videos, leading to overfitting and limited generalization.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_an-initial-exploration-learning-to-generate-realistic-audio-for-silent-video-an-initial-exploration-learning-to-generate-realistic-audio-for-silent-video.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_an-initial-exploration-learning-to-generate-realistic-audio-for-silent-video-an-initial-exploration-learning-to-generate-realistic-audio-for-silent-video.txt",
          "section_or_location": "V. CONCLUSION",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 1,
          "statement": "Comparative evaluation of three architectures for generating audio from silent video, demonstrating transformer-based approach is most successful for low and mid-frequency audio generation, with deep-fusion suffering discontinuities and Wavenet generating noise-like outputs.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_an-initial-exploration-learning-to-generate-realistic-audio-for-silent-video-an-initial-exploration-learning-to-generate-realistic-audio-for-silent-video.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_an-initial-exploration-learning-to-generate-realistic-audio-for-silent-video-an-initial-exploration-learning-to-generate-realistic-audio-for-silent-video.txt",
          "section_or_location": "IV. EXPERIMENTS",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_akvsr-audio-knowledge-empowered-visual-speech-recognition-by-compressing-audio-knowledge-of-a-pretrained-model",
      "slug": "akvsr-audio-knowledge-empowered-visual-speech-recognition-by-compressing-audio-knowledge-of-a-pretrained-model",
      "title": "AKVSR: Audio Knowledge Empowered Visual Speech Recognition by Compressing Audio Knowledge of a Pretrained Model",
      "year": 2023,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Jeong Hun Yeo",
        "Minsu Kim",
        "Jeongsoo Choi",
        "Dae Hoe Kim",
        "Yong Man Ro"
      ],
      "url": "https://nao-ki-mura.com/paper/akvsr-audio-knowledge-empowered-visual-speech-recognition-by-compressing-audio-knowledge-of-a-pretrained-model",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2308.07593",
      "arxiv_url": "https://arxiv.org/abs/2308.07593",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium-high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:lip",
        "deployment:hands-free",
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "modality:video",
        "output:text",
        "task:speech-recognition"
      ],
      "expert_take_short": "The paper advances visual speech recognition by selectively transferring refined linguistic audio knowledge via a learned compact memory and cross-attention injection, improving benchmark WERs over prior audio-assisted methods without requiring audio inputs during inference.",
      "expert_take_long": "This paper presents a strong technical contribution to visual speech recognition by leveraging large-scale pretrained audio models while addressing the key challenge that audio features contain speaker and noise characteristics detrimental to VSR. By vector quantization and clustering, the method isolates linguistic content into discrete memory slots. The Audio Bridging Module (ABM) uses cross-attention to retrieve matched memory entries per visual frame, allowing purely video-based inference without audio inputs. The effectiveness is demonstrated across LRS2 and LRS3 datasets with up to 433 hours of labeled data and VoxCeleb2 pseudo-label augmentations. The proposed system improves WER from a baseline 46.1% to 41.6% with the BASE model and achieves state-of-the-art levels with LARGE models, outperforming previous methods including AV-HuBERT and RAVEn. Ablations illustrate key design choices, such as the dimensionality of the audio memory, the optimal number of clusters (200), the benefit of removing non-linguistic factors over naive distillation methods, and ABM cross-attention layers (2 layers best). However, the system currently requires offline memory construction before training, and lacks assessments in real-time, streaming, or real-world multi-speaker/noise environments, limiting deployment readiness. The paper strengthens the baseline of audio-empowered VSR by emphasizing selective linguistic knowledge transfer, but remains focused on sentence-level benchmark scenarios.",
      "expert_true_value": "Demonstrates that eliminating non-linguistic factors in audio knowledge transfer and using a trainable discrete memory combined with cross-attention bridging significantly improves VSR performance, offering a more principled and effective audio-to-video knowledge transfer than prior naive distillation or feature concatenation methods.",
      "canon_before": "Existing VSR methods have started to incorporate audio knowledge through distillation or multimodal memory to complement insufficient visual signals, but often retain speaker and noise factors or require audio inputs during inference.",
      "delta_from_canon": "Transforms pretrained audio features into compact discrete memory removing non-linguistic components, then injects this linguistic memory into VSR via ABM cross-attention that operates without audio inputs at training or inference, improving selective audio knowledge transfer over previous audio-assisted VSR methods.",
      "position_in_field": "Strong recent visual speech recognition paper that shows an effective refined audio knowledge transfer that outperforms naive distillation and auxiliary tasks on standard sentence-level benchmarks.",
      "practical_value": "Enhances visual speech-to-text performance by importing linguistic audio features in a compact, selective form without needing audio during inference, useful for lip-reading systems seeking audio-pretraining benefits.",
      "axes_moved": "system_design; evaluation",
      "axes_unresolved": "real-time deployment; preprocessing cost of memory construction; broader interface relevance beyond sentence-level VSR",
      "axes_regressed": "",
      "technical_limits": "Offline compact audio memory construction required before training VSR; increases preprocessing cost and complexity; no real-time or streaming validation; no latency reported.",
      "evaluation_limits": "Evaluations are benchmark WER measures on LRS2 and LRS3 datasets using only video inputs at inference; no live or real-time demonstrations; performance gains demonstrated via multiple ablation studies but no external data or environment tests.",
      "deployment_limits": "Requires offline compact audio memory construction stage prior to VSR training; no evidence for real-time or embedded deployment; limited to sentence-level benchmarks without live camera or latency studies.",
      "scope_limits": "Sentence-level visual speech recognition on public datasets LRS2 and LRS3; no live or streaming experiments; no multi-environment or multi-speaker live deployment tested.",
      "task": "speech-recognition",
      "input_modality": "video (silent lip movement)",
      "sensor_hardware": "camera",
      "body_site": "lip",
      "output_type": "text",
      "vocabulary_type": "sentence-level open vocabulary",
      "vocabulary_size": "",
      "metrics": "WERs on LRS3: baseline 46.1%, AKVSR-HuBERT 41.6% (BASE) and for LARGE 29.1% (30h), 27.6% (433h), 23.6% (augmented); similar WER improvements on LRS2; comparison against prior SOTA with larger datasets; ablations detailed in Tables III–IX.",
      "evaluation_mode": "Quantitative WER comparison on LRS2 and LRS3 visual speech recognition benchmarks, including multiple ablation experiments over choice of pretrained audio model (CPC, wav2vec2.0, HuBERT), memory cluster size, embedding dimension, ABM cross-attention layers, and training dataset sources.",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 1,
          "statement": "We propose an Audio Knowledge empowered Visual Speech Recognition framework (AKVSR) which (1) utilizes rich audio knowledge encoded by a large-scale pretrained audio model, (2) saves the linguistic information by discarding non-linguistic information through quantization in compact audio memory, and (3) includes Audio Bridging Module which finds best-matched audio features from compact audio memory for training without audio inputs once the memory is composed.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_akvsr-audio-knowledge-empowered-visual-speech-recognition-by-compressing-audio-knowledge-of-a-pretrained-model-audio-knowledge-empowered-visual-speech-recognition.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_akvsr-audio-knowledge-empowered-visual-speech-recognition-by-compressing-audio-knowledge-of-a-pretrained-model-audio-knowledge-empowered-visual-speech-recognition.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 1,
          "statement": "The proposed method stores refined linguistic audio knowledge in a compact discrete memory obtained by vector quantization from a pretrained audio model (HuBERT) and injects matched memory information into visual features via cross-attention in the Audio Bridging Module (ABM), obviating the need for audio inputs during training or inference.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_akvsr-audio-knowledge-empowered-visual-speech-recognition-by-compressing-audio-knowledge-of-a-pretrained-model-audio-knowledge-empowered-visual-speech-recognition.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_akvsr-audio-knowledge-empowered-visual-speech-recognition-by-compressing-audio-knowledge-of-a-pretrained-model-audio-knowledge-empowered-visual-speech-recognition.txt",
          "section_or_location": "C",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "Table I shows the baseline visual speech recognition WER on LRS3 is 46.1%, while AKVSR with HuBERT-based compact audio memory reduces it to 41.6% (BASE Transformer), with further improvements in LARGE models (29.1%, 27.6%, 23.6%) surpassing AV-HuBERT and previous state of the art without language models during inference.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_akvsr-audio-knowledge-empowered-visual-speech-recognition-by-compressing-audio-knowledge-of-a-pretrained-model-audio-knowledge-empowered-visual-speech-recognition.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_akvsr-audio-knowledge-empowered-visual-speech-recognition-by-compressing-audio-knowledge-of-a-pretrained-model-audio-knowledge-empowered-visual-speech-recognition.txt",
          "section_or_location": "Table I",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "Table II on LRS2 confirms the proposed method improves WER over AV-HuBERT across Transformer BASE and LARGE models in low-resource (28h) and high-resource (223h) settings, e.g. from 32.2% to 28.7% WER in LARGE with 28h labeled data and from 25.5% to 24.1% in 223h data.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_akvsr-audio-knowledge-empowered-visual-speech-recognition-by-compressing-audio-knowledge-of-a-pretrained-model-audio-knowledge-empowered-visual-speech-recognition.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_akvsr-audio-knowledge-empowered-visual-speech-recognition-by-compressing-audio-knowledge-of-a-pretrained-model-audio-knowledge-empowered-visual-speech-recognition.txt",
          "section_or_location": "Table II",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 1,
          "statement": "Experiments are performed on the sentence-level LRS2 and LRS3 datasets, including low-resource and high-resource labeled data splits, as well as pseudo-annotated VoxCeleb2 additions to extend training data.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_akvsr-audio-knowledge-empowered-visual-speech-recognition-by-compressing-audio-knowledge-of-a-pretrained-model-audio-knowledge-empowered-visual-speech-recognition.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_akvsr-audio-knowledge-empowered-visual-speech-recognition-by-compressing-audio-knowledge-of-a-pretrained-model-audio-knowledge-empowered-visual-speech-recognition.txt",
          "section_or_location": "resource",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 1,
          "statement": "The method is evaluated strictly on benchmark datasets with WER as metric; there is no real-time or live camera study, and no latency analysis presented.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_akvsr-audio-knowledge-empowered-visual-speech-recognition-by-compressing-audio-knowledge-of-a-pretrained-model-audio-knowledge-empowered-visual-speech-recognition.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_akvsr-audio-knowledge-empowered-visual-speech-recognition-by-compressing-audio-knowledge-of-a-pretrained-model-audio-knowledge-empowered-visual-speech-recognition.txt",
          "section_or_location": "VII. Discussion",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 1,
          "statement": "The proposed method requires an offline stage to build the compact audio memory before training the VSR model, which adds preprocessing time and complexity, making real-time deployment currently unavailable.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_akvsr-audio-knowledge-empowered-visual-speech-recognition-by-compressing-audio-knowledge-of-a-pretrained-model-audio-knowledge-empowered-visual-speech-recognition.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_akvsr-audio-knowledge-empowered-visual-speech-recognition-by-compressing-audio-knowledge-of-a-pretrained-model-audio-knowledge-empowered-visual-speech-recognition.txt",
          "section_or_location": "VII. Discussion",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 1,
          "statement": "The evaluation is limited to sentence-level visual speech recognition benchmarks LRS2 and LRS3; there is no demonstration of generalization to real-world conditions, streaming, multi-speaker, or noisy environments.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_akvsr-audio-knowledge-empowered-visual-speech-recognition-by-compressing-audio-knowledge-of-a-pretrained-model-audio-knowledge-empowered-visual-speech-recognition.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_akvsr-audio-knowledge-empowered-visual-speech-recognition-by-compressing-audio-knowledge-of-a-pretrained-model-audio-knowledge-empowered-visual-speech-recognition.txt",
          "section_or_location": "VII. Discussion",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 1,
          "statement": "The key novelty is explicitly removing speaker and noise characteristics from the pretrained audio model's features via vector quantization and storing linguistic content in discrete clusters, then retrieving matched audio memory contents via cross-attention ABM injecting purely video-conditioned audio knowledge during VSR training and inference.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_akvsr-audio-knowledge-empowered-visual-speech-recognition-by-compressing-audio-knowledge-of-a-pretrained-model-audio-knowledge-empowered-visual-speech-recognition.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_akvsr-audio-knowledge-empowered-visual-speech-recognition-by-compressing-audio-knowledge-of-a-pretrained-model-audio-knowledge-empowered-visual-speech-recognition.txt",
          "section_or_location": "C",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_knowledge-distilled-ensemble-model-for-semg-based-silent-speech-interface",
      "slug": "knowledge-distilled-ensemble-model-for-semg-based-silent-speech-interface",
      "title": "Knowledge Distilled Ensemble Model for sEMG-based Silent Speech Interface",
      "year": 2023,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Wenqiang Lai",
        "Qihan Yang",
        "Ye Mao",
        "Endong Sun",
        "Jiangnan Ye"
      ],
      "url": "https://nao-ki-mura.com/paper/knowledge-distilled-ensemble-model-for-semg-based-silent-speech-interface",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2308.06533",
      "arxiv_url": "https://arxiv.org/abs/2308.06533",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:face",
        "modality:emg",
        "output:text",
        "task:text-entry",
        "evaluation:quantitative",
        "deployment:hands-free",
        "deployment:wearable"
      ],
      "expert_take_short": "This paper delivers a practical spelling-focused sEMG silent speech system by compressing a ResNet ensemble into a lightweight model achieving 85.9% accuracy on the NATO alphabet with portable hardware, but remains limited to 5 young male subjects and speaker-dependent scenarios.",
      "expert_take_long": "This paper presents a practical silent speech interface based on facial sEMG signals captured from three facial muscles, targeting spelling via the NATO phonetic alphabet. It leverages a ResNet1D backbone ensemble model combined via soft voting (VE-ResNet) achieving up to 88% accuracy on a small dataset of 5 young male subjects. To improve deployment practicality, the ensemble is compressed using knowledge distillation into a lightweight KDE-SSI model, maintaining 85.9% accuracy with much smaller model size and faster inference. The authors demonstrate careful preprocessing, data collection, and system design tying COTS hardware, alphabet-level interaction, and model compression. However, the dataset remains limited in size and demographics, evaluation does not include cross-subject generalization or unseen-word tests, and the system remains speaker-dependent with obtrusive adhesive electrodes. The work is a meaningful step toward portable spelling-oriented silent speech interfaces showing a balanced tradeoff between performance and deployability, but substantial evaluation and refinement are needed for broader real-world deployment.",
      "expert_true_value": "The principal contribution is demonstrating that knowledge distillation can compress a strong ensemble silent speech model into a lightweight, low-latency model suitable for practical portable spelling interfaces using the NATO phonetic alphabet over a real 3-channel facial sEMG dataset, addressing the deployment challenges of size and speed while maintaining competitive accuracy.",
      "canon_before": "Prior sEMG SSI research often used small vocabularies or large, complex models with non-portable custom hardware, limiting practical deployment.",
      "delta_from_canon": "Uses NATO phonetic alphabet spelling to enable arbitrary word construction and compresses a 6-model ResNet ensemble into a smaller single KDE-SSI model with negligible accuracy loss, improving portability and latency.",
      "position_in_field": "core sEMG SSI paper",
      "practical_value": "High for portable spelling-based silent speech interface applications where model size and low inference latency are critical.",
      "axes_moved": "portable_text_entry; model_compression; alphabet_scaling",
      "axes_unresolved": "speaker_independence; broader_demographics; continuous_language",
      "axes_regressed": "",
      "technical_limits": "Model requires adhesive electrodes with precise placement on 3 facial muscles; training data limited to 5 male subjects; method currently speaker-dependent; no generalization to broader users or conditions validated.",
      "evaluation_limits": "Evaluation limited to speaker-dependent scenario with 5 young male subjects; no testing on unseen words or cross-session generalization; accuracy evaluated on fixed 4:1:1 train/val/test split.",
      "deployment_limits": "Requires adhesive skin electrodes and controlled quiet seated posture; only tested on 5 young male subjects; speaker-independent use not demonstrated.",
      "scope_limits": "Focuses on spelling interface via NATO alphabet; not continuous speech decoding or broader language recognition.",
      "task": "text-entry",
      "input_modality": "emg",
      "sensor_hardware": "BITalino MuscleBIT bundle with prefixed electrode distances and adhesive Ag/AgCl electrodes on 3 facial muscles (levator anguli oris, depressor anguli oris, zygomaticus major).",
      "body_site": "face",
      "output_type": "text",
      "vocabulary_type": "NATO phonetic alphabet",
      "vocabulary_size": "26 classes",
      "metrics": "KDE-SSI achieved 85.9% accuracy with precision 87.4%, recall 85.7%, and F1-score 0.855 on the 26-class NATO alphabet classification. Ensemble VE-ResNet reached up to 86.0% accuracy. Model sizes were 21.1 MB (KDE-SSI) vs 147.9 MB (VE-ResNet), with inference latency 0.12 ms vs 2.50 ms per sample respectively.",
      "evaluation_mode": "4:1:1 train/validation/test split on whole dataset; experiments compare single ResNet1D, ensemble VE-ResNet, and distilled KDE-SSI models at various ensemble sizes and KD temperatures.",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 1,
          "statement": "The main contribution of our work is threefold: 1) construct a 26 words NATO phonetic alphabet dataset (3900 data samples in total) from the facial sEMG signals of 5 male subjects; 2) achieve 81.2% test accuracy for the single model on the created dataset; 3) implement a Knowledge Distilled Ensemble Model for Silent Speech Interface (KDE-SSI), a lightweight convolutional network that extracts knowledge from a pre-trained voting ensemble ResNet model (VE-ResNet) while maintaining performance, achieving 85.9% accuracy.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_knowledge-distilled-ensemble-model-for-semg-based-silent-speech-interface-knowledge-distilled-ensemble-model-for-semg-based-silent-speech-interface.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_knowledge-distilled-ensemble-model-for-semg-based-silent-speech-interface-knowledge-distilled-ensemble-model-for-semg-based-silent-speech-interface.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "fact",
          "confidence": 1,
          "statement": "The dataset consists of 3900 samples, collected from 5 male subjects aged 22 to 24; each subject repeated the 26 NATO phonetic alphabet words 30 times, resulting in 150 samples per class, recorded via BITalino MuscleBIT bundle on 3 facial muscles: levator anguli oris, depressor anguli oris, and zygomaticus major.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_knowledge-distilled-ensemble-model-for-semg-based-silent-speech-interface-knowledge-distilled-ensemble-model-for-semg-based-silent-speech-interface.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_knowledge-distilled-ensemble-model-for-semg-based-silent-speech-interface-knowledge-distilled-ensemble-model-for-semg-based-silent-speech-interface.txt",
          "section_or_location": "II.DATASET",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "KDE-SSI achieves 85.9% accuracy with 87.4% precision, 85.7% recall and 0.855 F1-score on the 26-class NATO phonetic alphabet classification; VE-ResNet teacher ensemble reaches 86.0% accuracy; inference time and model size of KDE-SSI are 0.12 ms and 21.1 MB versus 2.50 ms and 147.9 MB for VE-ResNet.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_knowledge-distilled-ensemble-model-for-semg-based-silent-speech-interface-knowledge-distilled-ensemble-model-for-semg-based-silent-speech-interface.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_knowledge-distilled-ensemble-model-for-semg-based-silent-speech-interface-knowledge-distilled-ensemble-model-for-semg-based-silent-speech-interface.txt",
          "section_or_location": "V.RESULT",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 1,
          "statement": "The evaluation is limited to 5 young male subjects; only speaker-dependent training and testing are conducted; no cross-subject or cross-session robustness analysis; the system requires adhesive electrodes placed on three facial muscles; does not address speaker independence or broader demographics.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_knowledge-distilled-ensemble-model-for-semg-based-silent-speech-interface-knowledge-distilled-ensemble-model-for-semg-based-silent-speech-interface.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_knowledge-distilled-ensemble-model-for-semg-based-silent-speech-interface-knowledge-distilled-ensemble-model-for-semg-based-silent-speech-interface.txt",
          "section_or_location": "II.DATASET",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "deployment_claim",
          "confidence": 1,
          "statement": "The proposed method reduces model size by a factor of 7 and speeds inference by about 20x compared to the teacher ensemble, enabling portable, practical deployment with COTS hardware and low latency, while preserving accuracy.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_knowledge-distilled-ensemble-model-for-semg-based-silent-speech-interface-knowledge-distilled-ensemble-model-for-semg-based-silent-speech-interface.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_knowledge-distilled-ensemble-model-for-semg-based-silent-speech-interface-knowledge-distilled-ensemble-model-for-semg-based-silent-speech-interface.txt",
          "section_or_location": "IV.METHOD",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_automatically-measuring-speech-fluency-in-people-with-aphasia-first-achievements-using-read-speech-data",
      "slug": "automatically-measuring-speech-fluency-in-people-with-aphasia-first-achievements-using-read-speech-data",
      "title": "Automatically measuring speech fluency in people with aphasia: first achievements using read-speech data",
      "year": 2023,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "[''",
        "''",
        "''",
        "''",
        "'']"
      ],
      "url": "https://nao-ki-mura.com/paper/automatically-measuring-speech-fluency-in-people-with-aphasia-first-achievements-using-read-speech-data",
      "doi": "10.1080/02687038.2023.2244728",
      "doi_url": "https://doi.org/10.1080/02687038.2023.2244728",
      "arxiv_id": "2308.04763",
      "arxiv_url": "https://arxiv.org/abs/2308.04763",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "modality:acoustic",
        "modality:microphone",
        "output:labels",
        "evaluation:quantitative"
      ],
      "expert_take_short": "Strong clinical fluency regression method validated on noisy read speech from aphasia patients; outside core SSI modalities and use-cases.",
      "expert_take_long": "The full text documents a method using forward-backward divergence segmentation on remote read-speech recordings from people with aphasia, clustering speech segments into pseudo-syllables and silent breaks to derive acoustic predictors of fluency. Multiple regression models, validated with leave-one-speaker-out cross-validation against experienced SLP fluency ratings on a 5-point scale, achieve strong correlations (participant-level Pearson r up to 0.96) with root mean squared errors below 0.5 on the scale. This demonstrates a practical, low-resource, objective, and reproducible clinical fluency scoring pipeline using common microphones despite noisy remote recording conditions. The method focuses on read speech and does not tackle spontaneous speech, more complex fluency aspects, speaker independence beyond the tested French cohort, nor does it propose a silent speech interface. Nevertheless, it provides a credible, cost-effective monitoring tool for aphasia assessment workflows, framing fluency assessment as regression over acoustic timing features. Further clinical validation and extension to spontaneous speech are necessary before wider deployment.",
      "expert_true_value": "Provides a solid clinical speech fluency measurement approach robust to home recording noise capable of replacing subjective aphasia fluency ratings, but not an SSI interface or silent speech decoder.",
      "canon_before": "Aphasia fluency scoring is usually subjective, slow, and variable across raters.",
      "delta_from_canon": "Replaces manual fluency judgement with automatic regression models using engineered acoustic fluency predictors robust to noisy home recordings.",
      "position_in_field": "Adjacent clinical speech analytics rather than core SSI research.",
      "practical_value": "Useful for low-cost, objective fluency monitoring in aphasia clinical assessment under realistic noisy home-recording scenarios.",
      "axes_moved": "clinical_measurement; objective_fluency_scoring; noisy_remote_recording",
      "axes_unresolved": "spontaneous_speech; multilingual_transfer; external_validation",
      "axes_regressed": "",
      "technical_limits": "Limited to read-speech scenarios; lacks dedicated modeling for repetitions initially; uses a small hand-crafted predictor set; no spontaneous speech handling or multilingual evaluation.",
      "evaluation_limits": "Limited dataset size (95 recordings from 34 participants), all reading fixed sentences; no evaluation on spontaneous speech or other languages; only three trained SLP raters as ground truth.",
      "deployment_limits": "Requires external clinical validation beyond the French read-speech protocol; limited to read speech; no spontaneous speech evaluation; uses standard PC microphones which may limit clinical integration.",
      "scope_limits": "No silent speech decoding or interfaces; restricted to read speech only; no non-acoustic sensing; limited speaker and language diversity.",
      "task": "speech fluency regression",
      "input_modality": "Acoustic read speech recorded via built-in PC microphones during Zoom calls.",
      "sensor_hardware": "Built-in PC microphones used in Zoom videoconferencing sessions.",
      "body_site": "",
      "output_type": "labels (fluency ratings on a 5-point scale).",
      "vocabulary_type": "",
      "vocabulary_size": "",
      "metrics": "Root-mean-square error (RMSE) as low as 0.47 with added repetition-aware predictor; Pearson correlation coefficients of 0.87 at sentence level and 0.93 to 0.96 at participant level for multiple linear regression and enhanced models.",
      "evaluation_mode": "Leave-one-speaker-out cross-validation predicting speech fluency ratings provided by three expert speech-language pathologists on a 5-point scale at sentence and participant levels.",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 1,
          "statement": "The main aim of the present study is to investigate whether such signal-processing techniques can be used to predict the speech fluency of PWA, as evaluated by experienced SLPs. As, in aphasiology, subjective judgements of speech fluency have been criticised for their lack of reliability, a first task was to check the consistency of SLP ratings.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_automatically-measuring-speech-fluency-in-people-with-aphasia-first-achievements-using-read-speech-data-automatically-measuring-speech-fluency-in-people-with-aphasia-first-achievements.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_automatically-measuring-speech-fluency-in-people-with-aphasia-first-achievements-using-read-speech-data-automatically-measuring-speech-fluency-in-people-with-aphasia-first-achievements.txt",
          "section_or_location": "Aims",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 1,
          "statement": "The dataset contained 95 recorded sentences from 34 participants (29 with aphasia, 5 controls), reading three long sentences from the French Boston Diagnostic Aphasia Examination, recorded remotely over Zoom with built-in PC microphones downsampled to 16 kHz.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_automatically-measuring-speech-fluency-in-people-with-aphasia-first-achievements-using-read-speech-data-automatically-measuring-speech-fluency-in-people-with-aphasia-first-achievements.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_automatically-measuring-speech-fluency-in-people-with-aphasia-first-achievements-using-read-speech-data-automatically-measuring-speech-fluency-in-people-with-aphasia-first-achievements.txt",
          "section_or_location": "Materials",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "All models achieved accurate predictions of speech fluency ratings, with average RMSE as low as 0.47 and Pearson correlations up to 0.96 when aggregated per participant, using multivariate regression models including multiple linear regression, support vector regression, and random forest regression with leave-one-speaker-out evaluation.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_automatically-measuring-speech-fluency-in-people-with-aphasia-first-achievements-using-read-speech-data-automatically-measuring-speech-fluency-in-people-with-aphasia-first-achievements.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_automatically-measuring-speech-fluency-in-people-with-aphasia-first-achievements-using-read-speech-data-automatically-measuring-speech-fluency-in-people-with-aphasia-first-achievements.txt",
          "section_or_location": "Results",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "fact",
          "confidence": 1,
          "statement": "The proposed forward-backward divergence segmentation algorithm clusters speech into pseudo-syllables and silent breaks; four acoustic predictors (pseudo-syllable rate, speech ratio, rate of silent breaks, and standard deviation of pseudo-syllable length) were extracted and combined in regression models to predict fluency.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_automatically-measuring-speech-fluency-in-people-with-aphasia-first-achievements-using-read-speech-data-automatically-measuring-speech-fluency-in-people-with-aphasia-first-achievements.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_automatically-measuring-speech-fluency-in-people-with-aphasia-first-achievements-using-read-speech-data-automatically-measuring-speech-fluency-in-people-with-aphasia-first-achievements.txt",
          "section_or_location": "Materials",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "deployment_claim",
          "confidence": 0.9,
          "statement": "The algorithm can be rapidly executed on standard PCs and is robust to noisy remote home recordings with ordinary microphones, indicating potential for clinical use in real-world scenarios with low-cost recording setups.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_automatically-measuring-speech-fluency-in-people-with-aphasia-first-achievements-using-read-speech-data-automatically-measuring-speech-fluency-in-people-with-aphasia-first-achievements.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_automatically-measuring-speech-fluency-in-people-with-aphasia-first-achievements-using-read-speech-data-automatically-measuring-speech-fluency-in-people-with-aphasia-first-achievements.txt",
          "section_or_location": "Discussion",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_exploring-how-a-generative-ai-interprets-music",
      "slug": "exploring-how-a-generative-ai-interprets-music",
      "title": "Exploring how a Generative AI interprets music",
      "year": 2023,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "[''",
        "''",
        "''",
        "'']"
      ],
      "url": "https://nao-ki-mura.com/paper/exploring-how-a-generative-ai-interprets-music",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2308.00015",
      "arxiv_url": "https://arxiv.org/abs/2308.00015",
      "review_state": "expert_fulltext_draft",
      "review_priority": "low",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "modality:acoustic",
        "output:audio",
        "evaluation:quantitative"
      ],
      "expert_take_short": "A thorough interpretability analysis reveals that MusicVAE uses only a few dozen latent dimensions to encode music with pitch and rhythm strongly represented in the first two, but the work has no direct relevance to silent speech interfaces.",
      "expert_take_long": "This paper offers an interpretability study of the latent space of Google MusicVAE, a variational autoencoder trained on millions of symbolic monophonic musical sequences. It clearly demonstrates that most of the 512 latent dimensions are effectively noise neurons that contain no music-related information. Instead, only about 37 'music neurons' carry meaningful musical content. Among these, the first two latent dimensions correspond most strongly to human-defined pitch and rhythm features, identified through nonlinear correlation analysis with jSymbolic variables. While melody features appear only in latent dimensions lower down the importance order and only become more independent in the 16-bar case. The study uses datasets of randomly sampled musical sequences and artificial random note sequences to contrast excitation behaviors in latent neurons. However, the scope is limited to symbolic, monophonic MIDI and correlation analyses, with no causal interventions or downstream applications for speech or silent speech interfaces. Its relevance to the SSI domain is negligible, as it neither processes speech nor involves wearable sensors or user control. Overall, it is a solid latent representation analysis with interesting insights into how musical concepts are encoded, but it does not contribute to the SSI literature or deployment-ready systems.",
      "expert_true_value": "Provides detailed latent-space interpretability for symbolic music generative models by isolating key latent dimensions encoding pitch and rhythm, clarifying how complex musical features are compressed, but lacks any SSI application or speech modality.",
      "canon_before": "Generative music models generally have latent spaces that are uninterpretable and lack a clear human-understandable organization.",
      "delta_from_canon": "Identifies that MusicVAE's 512-dimensional latent space primarily uses only a few dozen latent dimensions ('music neurons') to encode actual musical information, with the first two canonical dimensions strongly aligned with pitch and rhythm, and further dimensions loosely with melody for longer sequences.",
      "position_in_field": "Completely outside SSI; purely a latent representation analysis for symbolic music generative modeling.",
      "practical_value": "Useful as a case study for interpretable latent space analysis in generative music AI, with no SSI deployment relevance.",
      "axes_moved": "latent_interpretability; symbolic_music_analysis",
      "axes_unresolved": "causal_neuron_editing; speech_transfer; downstream_usefulness",
      "axes_regressed": "",
      "technical_limits": "Limited to symbolic monophonic music and correlation-based latent analysis; no causal or downstream task evaluation.",
      "evaluation_limits": "Interpretation and conclusions are limited to the Google MusicVAE model's latent space and the symbolic monophonic music dataset used, with no testing on other models or real-world SSI tasks.",
      "deployment_limits": "No deployment system or user-facing application is presented; the work is a latent-space analysis and thus not directly deployable or applicable to SSI devices.",
      "scope_limits": "No speech processing, no articulation or silent speech study, no wearable or interactive sensing, purely symbolic music latent space analysis.",
      "task": "latent-space interpretability",
      "input_modality": "symbolic music sequences (MIDI)",
      "sensor_hardware": "",
      "body_site": "",
      "output_type": "latent feature analysis",
      "vocabulary_type": "",
      "vocabulary_size": "",
      "metrics": "Correlation coefficients (nonlinear phik correlations) between latent neuron central values and human-defined symbolic music features; number of activated latent neurons above a threshold; distributions of mean and standard deviation in latent dimensions.",
      "evaluation_mode": "Latent space inspection and nonlinear correlation analysis on samples generated by the model and musical feature extraction using the music21 and jSymbolic libraries.",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 0.95,
          "statement": "We use Google’s MusicVAE, a Variational Auto-Encoder with a 512-dimensional latent space to represent a few bars of music, and organize the latent dimensions according to their relevance in describing music, interpreting pitch, rhythm, and melody in the latent space.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_exploring-how-a-generative-ai-interprets-music-exploring-how-a-generative-ai-interprets-music.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_exploring-how-a-generative-ai-interprets-music-exploring-how-a-generative-ai-interprets-music.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "fact",
          "confidence": 0.95,
          "statement": "MusicVAE is trained on about 1.5 million MIDI files filtered for 4/4 time, with 3.8 million monophonic 2-bar sequences and 11.4 million 16-bar sequences extracted, encoding monophonic melodies in a 512-dimensional latent space.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_exploring-how-a-generative-ai-interprets-music-exploring-how-a-generative-ai-interprets-music.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_exploring-how-a-generative-ai-interprets-music-exploring-how-a-generative-ai-interprets-music.txt",
          "section_or_location": "2 \"Twinkle",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "fact",
          "confidence": 0.95,
          "statement": "Analysis across 50,000 melodies from the dataset shows that only about 37 latent neurons (\"music neurons\") have small standard deviations and varying means that encode musical information, whereas the remaining 475 neurons behave as \"noise neurons\" with σ≈1 and µ≈0 showing no variation across pieces of music.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_exploring-how-a-generative-ai-interprets-music-exploring-how-a-generative-ai-interprets-music.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_exploring-how-a-generative-ai-interprets-music-exploring-how-a-generative-ai-interprets-music.txt",
          "section_or_location": "3 The structure of MusicVAE’s latent space",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "fact",
          "confidence": 0.95,
          "statement": "Correlation analysis using music21 and jSymbolic symbolic music features shows that the first two canonical latent neurons encode most pitch and rhythm information, with nonlinear correlations linking the first neuron strongly with pitch features and the second neuron with rhythm features, while melody features only emerge independently in longer sequences and lower-ranked neurons.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_exploring-how-a-generative-ai-interprets-music-exploring-how-a-generative-ai-interprets-music.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_exploring-how-a-generative-ai-interprets-music-exploring-how-a-generative-ai-interprets-music.txt",
          "section_or_location": "4 Neurons for pitch",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_audio-visual-video-to-speech-synthesis-with-synthesized-input-audio",
      "slug": "audio-visual-video-to-speech-synthesis-with-synthesized-input-audio",
      "title": "Audio-visual video-to-speech synthesis with synthesized input audio",
      "year": 2023,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Triantafyllos Kefalas",
        "Yannis Panagakis",
        "Maja Pantic"
      ],
      "url": "https://nao-ki-mura.com/paper/audio-visual-video-to-speech-synthesis-with-synthesized-input-audio",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2307.16584",
      "arxiv_url": "https://arxiv.org/abs/2307.16584",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:lip",
        "deployment:hands-free",
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "modality:multimodal",
        "modality:video",
        "output:speech-audio",
        "task:speech-reconstruction"
      ],
      "expert_take_short": "The paper credibly shows that incorporating synthesized audio as an auxiliary input in a second-stage audiovisual synthesis model improves video-to-speech reconstruction quality and intelligibility in benchmarks, though gains depend on model variant and dataset.",
      "expert_take_long": "This paper presents a significant advance in video-to-speech synthesis by proposing and evaluating a two-stage approach that treats synthesized audio as an essential intermediate modality to enhance speech reconstruction from silent video. The AV2A architectures append a learnable audio encoder to a pretrained V2A model and are trained with modality dropout to prevent reliance on a single modality. Across benchmark datasets (GRID with 4 and 33 speakers, both seen and unseen, TCD-TIMIT lipspeakers, and LRW), AV2A models, particularly raw waveform V2A-WaveGAN variants with audio pretraining and modality dropout using ground-truth audio during training, consistently improve perceptual and intelligibility metrics (PESQ up to 2.10, STOI up to 0.723, ESTOI up to 0.553), with reductions in WER compared to base V2A models. However, improvements are not universal: some mel-spectrogram models underperform older baselines on certain datasets, and WER gains are modest or fluctuating, especially on unseen speaker splits. The work’s main contribution lies in demonstrating the feasibility and benefits of incorporating synthesized audio in a staged AV2A approach, with novel training methods to balance modalities. This overcomes traditional practices discarding audio at inference, framing synthesized audio as a useful intermediate representation. Nonetheless, limitations remain due to lack of real-time testing, reliance on benchmark datasets under clean conditions, and dependence on the quality of first-stage synthesized audio. The approach is promising for silent-video speech reconstruction research but requires further development and deployment-oriented evaluation for practical use.",
      "expert_true_value": "Demonstrates a novel two-stage audiovisual speech synthesis pipeline that leverages synthesized audio as an intermediate representation, trained with modality dropout, achieving improved objective metrics and enabling robust audiovisual speech reconstruction from silent videos.",
      "canon_before": "Most video-to-speech synthesis systems used either video only or included audio during training but discarded audio input at inference, treating missing audio as unavailable rather than as an explicit intermediate representation.",
      "delta_from_canon": "Introduces a two-stage pipeline leveraging synthesized audio from a first-stage V2A model as explicit input to a second-stage AV2A model, trained with modality dropout to robustly combine video and audio modalities for speech reconstruction.",
      "position_in_field": "A strong contribution to video-to-speech synthesis literature centered on staged audiovisual reconstruction incorporating synthesized audio.",
      "practical_value": "Provides a benchmarked, replicable approach to combining synthesized audio with silent video for improved speech synthesis; relevant for researchers of audiovisual speech synthesis and multimodal enhancement.",
      "axes_moved": "system_design; evaluation",
      "axes_unresolved": "impact of different training variants across datasets; live deployment suitability; correlation with real-time constraints; dependence on synthesized audio quality",
      "axes_regressed": "",
      "technical_limits": "Effectiveness depends on the quality of synthesized audio from the first-stage V2A model; WER improvements are inconsistent across datasets and model variants; no demonstrated results for in-the-wild noisy conditions or real-time inference.",
      "evaluation_limits": "Objective metrics (PESQ, STOI, ESTOI, WER) on benchmark datasets; lack of in-the-wild, noisy environment, or real-time latency evaluations; WER unavailable for TCD-TIMIT due to lack of suitable pre-trained ASR models.",
      "deployment_limits": "Benchmark evaluations only; relies on pretrained base models; no real-time or latency evaluation; no in-the-wild or noisy environment testing; dependency on quality of synthesized audio inputs.",
      "scope_limits": "Applies to video-to-audio reconstruction tasks in benchmark datasets with limited speaker diversity and controlled conditions; does not address noisy or real-time conditions.",
      "task": "speech-reconstruction",
      "input_modality": "Silent video plus synthesized audio",
      "sensor_hardware": "camera",
      "body_site": "lip",
      "output_type": "speech-audio",
      "vocabulary_type": "Sentence-level benchmark vocabularies",
      "vocabulary_size": "",
      "metrics": "GRID (4 speakers, seen): PESQ 1.95, STOI 0.698, ESTOI 0.532, WER 3.67%; GRID (33 speakers, seen): PESQ 2.10, STOI 0.723, ESTOI 0.553, WER 2.65%; TCD-TIMIT (3 lipspeakers, seen): PESQ 1.44, STOI 0.566, ESTOI 0.411; LRW unseen: WER down to 24.96% in best raw waveform models; metrics reported include PESQ, STOI, ESTOI, WER.",
      "evaluation_mode": "Multi-dataset quantitative benchmark comparison with multiple training variants and modalities in raw waveform and mel-spectrogram domains, including seen and unseen speaker splits.",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 1,
          "statement": "In this work we investigate video-to-speech synthesis models that include audio and video inputs during both training and inference. We do so in both the raw waveform and mel spectrogram domains.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_audio-visual-video-to-speech-synthesis-with-synthesized-input-audio-audio-visual-video-to-speech-synthesis-with-synthesized-input-audio.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_audio-visual-video-to-speech-synthesis-with-synthesized-input-audio-audio-visual-video-to-speech-synthesis-with-synthesized-input-audio.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 1,
          "statement": "We propose a video-to-speech synthesis approach that employs audio and video inputs during both training and inference by transforming the V2A model into a corresponding AV2A model, using the V2A model to synthesize the input audio, and training under modality dropout.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_audio-visual-video-to-speech-synthesis-with-synthesized-input-audio-audio-visual-video-to-speech-synthesis-with-synthesized-input-audio.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_audio-visual-video-to-speech-synthesis-with-synthesized-input-audio-audio-visual-video-to-speech-synthesis-with-synthesized-input-audio.txt",
          "section_or_location": "I. INTRODUCTION",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 0.95,
          "statement": "We construct the audio-visual-to-audio Generator by appending an audio encoder to the video-to-audio Generator, which receives the synthesized audio as input and concatenates audio and video features with a speaker embedding before feeding to temporal and decoding modules.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_audio-visual-video-to-speech-synthesis-with-synthesized-input-audio-audio-visual-video-to-speech-synthesis-with-synthesized-input-audio.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_audio-visual-video-to-speech-synthesis-with-synthesized-input-audio-audio-visual-video-to-speech-synthesis-with-synthesized-input-audio.txt",
          "section_or_location": "III. RAW WAVEFORM MODELS",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 1,
          "statement": "Experiments evaluate the models on GRID in seen speaker splits with 4 and 33 speakers, on unseen GRID with 33 speakers, on TCD-TIMIT lipspeakers, and on LRW in an unseen speaker setting.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_audio-visual-video-to-speech-synthesis-with-synthesized-input-audio-audio-visual-video-to-speech-synthesis-with-synthesized-input-audio.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_audio-visual-video-to-speech-synthesis-with-synthesized-input-audio-audio-visual-video-to-speech-synthesis-with-synthesized-input-audio.txt",
          "section_or_location": "V. EXPERIMENTAL METHODOLOGY",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "On GRID 4-speaker seen split, the best AV2A-WaveGAN model with audio pretraining and modality dropout using ground truth audio achieved PESQ 1.95, STOI 0.698, ESTOI 0.532, and WER 3.67 percent, outperforming base V2A-WaveGAN and other baselines.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_audio-visual-video-to-speech-synthesis-with-synthesized-input-audio-audio-visual-video-to-speech-synthesis-with-synthesized-input-audio.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_audio-visual-video-to-speech-synthesis-with-synthesized-input-audio-audio-visual-video-to-speech-synthesis-with-synthesized-input-audio.txt",
          "section_or_location": "VI. RESULTS",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "On GRID 33-speaker seen split, AV2A-WaveGAN with audio pretraining and modality dropout (GT audio) achieved PESQ 2.10, STOI 0.723, ESTOI 0.553, and WER 2.65, surpassing the base V2A-WaveGAN and several prior methods.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_audio-visual-video-to-speech-synthesis-with-synthesized-input-audio-audio-visual-video-to-speech-synthesis-with-synthesized-input-audio.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_audio-visual-video-to-speech-synthesis-with-synthesized-input-audio-audio-visual-video-to-speech-synthesis-with-synthesized-input-audio.txt",
          "section_or_location": "VI. RESULTS",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "For TCD-TIMIT with 3 lipspeakers (seen), AV2A-WaveGAN with audio pretraining and modality dropout (GT audio) reached PESQ 1.44, STOI 0.566, ESTOI 0.411.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_audio-visual-video-to-speech-synthesis-with-synthesized-input-audio-audio-visual-video-to-speech-synthesis-with-synthesized-input-audio.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_audio-visual-video-to-speech-synthesis-with-synthesized-input-audio-audio-visual-video-to-speech-synthesis-with-synthesized-input-audio.txt",
          "section_or_location": "VI. RESULTS",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 1,
          "statement": "The paper relies on objective metrics and WER across benchmark corpora, and WER is unavailable for TCD-TIMIT as no accurate pre-trained ASR model could be found.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_audio-visual-video-to-speech-synthesis-with-synthesized-input-audio-audio-visual-video-to-speech-synthesis-with-synthesized-input-audio.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_audio-visual-video-to-speech-synthesis-with-synthesized-input-audio-audio-visual-video-to-speech-synthesis-with-synthesized-input-audio.txt",
          "section_or_location": "V. EXPERIMENTAL METHODOLOGY",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_audio-aware-query-enhanced-transformer-for-audio-visual-segmentation",
      "slug": "audio-aware-query-enhanced-transformer-for-audio-visual-segmentation",
      "title": "Audio-aware Query-enhanced Transformer for Audio-Visual Segmentation",
      "year": 2023,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Jinxiang Liu",
        "Chen Ju",
        "Chaofan Ma",
        "Yanfeng Wang",
        "Yu Wang",
        "Ya Zhang"
      ],
      "url": "https://nao-ki-mura.com/paper/audio-aware-query-enhanced-transformer-for-audio-visual-segmentation",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2307.13236",
      "arxiv_url": "https://arxiv.org/abs/2307.13236",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "high",
      "review_basis": "full_text+structured_benchmark+summary",
      "source_coverage": "high",
      "tags": [
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "modality:acoustic",
        "modality:multimodal",
        "modality:video",
        "output:labels"
      ],
      "expert_take_short": "Strong AVS result, outside SSI: the useful idea is audio-conditioned decoder queries plus dynamic mask prediction.",
      "expert_take_long": "The full text makes the contribution tighter than the abstract summary did. The paper is not just 'transformers for AVS'; it targets a specific failure mode where fusion models over-segment salient but silent objects. The evidence is reasonably broad for this task: headline gains on S4 and MS3, successful transfer from S4 pretraining into MS3 where TPAVI degrades, and open-set testing where AuTR still drops but stays materially ahead of the fusion baseline. That is a solid multimodal segmentation result, but it should not be read as silent-speech work.",
      "expert_true_value": "This is a strong AVS architecture paper with credible generalization evidence; it belongs in a multimodal vision-and-audio archive, not as an SSI contribution.",
      "canon_before": "AVS had pixel supervision but still relied on fusion-decoder pipelines that fused audio and vision weakly and often segmented salient silent objects.",
      "delta_from_canon": "The paper makes decoder queries explicitly audio-aware and uses dynamic convolution for instance-specific masks, turning audio guidance into a first-class part of segmentation.",
      "position_in_field": "Competitive AVS model for sounding-object segmentation with stronger open-set and multi-sound behavior than TPAVI.",
      "practical_value": "Useful for multimodal segmentation and audio-guided scene understanding, not for silent speech communication.",
      "axes_moved": "audio guidance; multimodal fusion; open-set generalization",
      "axes_unresolved": "performance beyond AVSBench; robustness to cluttered real-world scenes; deployment efficiency",
      "axes_regressed": "",
      "technical_limits": "The model remains tied to AVSBench-style sounding-object masks and does not address silent speech, language output, or communication use.",
      "evaluation_limits": "Quantitative evidence is concentrated on AVSBench and held-out AVSBench categories; there is no deployment latency or real-world product evaluation.",
      "deployment_limits": "Research segmentation stack only; no claim of SSI or real-time communication deployment is supported.",
      "scope_limits": "Audio-visual segmentation of sounding objects only; outside SSI scope.",
      "task": "audio-visual segmentation",
      "input_modality": "audio + video",
      "sensor_hardware": "",
      "body_site": "",
      "output_type": "labels",
      "vocabulary_type": "",
      "vocabulary_size": "",
      "metrics": "TABLE I: PVT-v2 reaches 80.4 MJ / .891 MF on S4 and 56.2 MJ / .672 MF on MS3. TABLE II: S4 pretraining then fine-tuning lifts MS3 to 60.95 MJ / .725 MF. TABLE III: on unseen open-set categories PVT-v2 reaches 66.22 MJ / .777 MF versus TPAVI at 55.86 MJ / .719.",
      "evaluation_mode": "AVSBench S4 and MS3 segmentation, S4-to-MS3 fine-tuning, open-set evaluation on held-out categories, and ablation on audio-aware queries and dynamic convolution",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 0.99,
          "statement": "AuTR introduces audio-aware learnable queries and dynamic convolution so the decoder can focus on sounding objects while suppressing salient silent objects.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_audio-aware-query-enhanced-transformer-for-audio-visual-segmentation-audio-aware-query-enhanced-transformer-for-audio-visual-segmentation.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_audio-aware-query-enhanced-transformer-for-audio-visual-segmentation-audio-aware-query-enhanced-transformer-for-audio-visual-segmentation.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.99,
          "statement": "With the PVT-v2 backbone, AuTR reports 80.4 MJ / .891 MF on S4 and 56.2 MJ / .672 MF on MS3, beating the TPAVI baseline on both subsets.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_audio-aware-query-enhanced-transformer-for-audio-visual-segmentation-audio-aware-query-enhanced-transformer-for-audio-visual-segmentation.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_audio-aware-query-enhanced-transformer-for-audio-visual-segmentation-audio-aware-query-enhanced-transformer-for-audio-visual-segmentation.txt",
          "section_or_location": "TABLE I",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 0.97,
          "statement": "Fine-tuning AuTR from S4 to MS3 improves MJ by 6.59 with ResNet50 and by 4.74 with PVT-v2, while TPAVI degrades under the same transfer setting.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_audio-aware-query-enhanced-transformer-for-audio-visual-segmentation-audio-aware-query-enhanced-transformer-for-audio-visual-segmentation.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_audio-aware-query-enhanced-transformer-for-audio-visual-segmentation-audio-aware-query-enhanced-transformer-for-audio-visual-segmentation.txt",
          "section_or_location": "B. Performance Improvement for Multiple Sound Sources",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 0.95,
          "statement": "Open-set performance still drops on unseen categories, with AuTR falling from 77.56 to 66.22 MJ on PVT-v2 even though it remains ahead of TPAVI.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_audio-aware-query-enhanced-transformer-for-audio-visual-segmentation-audio-aware-query-enhanced-transformer-for-audio-visual-segmentation.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_audio-aware-query-enhanced-transformer-for-audio-visual-segmentation-audio-aware-query-enhanced-transformer-for-audio-visual-segmentation.txt",
          "section_or_location": "C. Open Set Audio Visual Segmentation",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_robustl2s-speaker-specific-lip-to-speech-synthesis-exploiting-self-supervised-representations",
      "slug": "robustl2s-speaker-specific-lip-to-speech-synthesis-exploiting-self-supervised-representations",
      "title": "RobustL2S: Speaker-Specific Lip-to-Speech Synthesis exploiting Self-Supervised Representations",
      "year": 2023,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Neha Sahipjohn",
        "Neil Shah",
        "Vishal Tambrahalli",
        "Vineet Gandhi"
      ],
      "url": "https://nao-ki-mura.com/paper/robustl2s-speaker-specific-lip-to-speech-synthesis-exploiting-self-supervised-representations",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2307.01233",
      "arxiv_url": "https://arxiv.org/abs/2307.01233",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:lip",
        "deployment:speaker-dependent",
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "modality:video",
        "output:speech-audio",
        "task:speech-reconstruction"
      ],
      "expert_take_short": "Strong modular SSL-based lip-to-speech synthesis paper that innovatively maps lip SSL features to disentangled speech embeddings before vocoder synthesis, demonstrating improved intelligibility and robustness across benchmark datasets.",
      "expert_take_long": "RobustL2S convincingly reframes lip-to-speech synthesis by replacing direct mel-spectrogram prediction with a two-stage process using self-supervised representations. The lip encoder (AV-HuBERT) extracts visual features which a non-autoregressive seq2seq model maps to speech SSL features (HuBERT). A speaker-conditioned vocoder then synthesizes speech waveforms from these disentangled speech embeddings. This modular design improves robustness to speaker and ambient variability and boosts intelligibility, demonstrated by STOI, ESTOI, WER, and MOS improvements across three diverse datasets (Lip2Wav, GRID-4S, TCD-TIMIT-3S). While the unconstrained setting is still speaker-dependent and prosody aspects remain limited, the system sets a strong new baseline for SSL-based lip-to-speech synthesis. The main limitation is its missing demonstration in real-time or deployment settings, and limited evaluation on unseen vocabulary or truly speaker-independent scenarios.",
      "expert_true_value": "The key advance is the modular disentanglement using self-supervised speech representations to separate content from speaker and ambient variation, which simplifies learning and improves synthesis intelligibility over prior direct mel regression methods.",
      "canon_before": "Direct mel-spectrogram prediction from lip video entangled with speaker and ambient variation, limiting intelligibility and model efficiency.",
      "delta_from_canon": "The approach uses a two-stage pipeline mapping lip SSL features to speech SSL representations followed by vocoder synthesis, decoupling content from speaker and ambient information.",
      "position_in_field": "A strong 2023 reference for SSL-based lip-to-speech synthesis, useful as a benchmark especially in speaker-dependent unconstrained scenarios.",
      "practical_value": "Useful as a modular high-quality lip-to-speech benchmark for speaker-specific synthesis leveraging state-of-the-art SSL representations.",
      "axes_moved": "ssl_representation_learning; intelligibility; multi_dataset_benchmarking",
      "axes_unresolved": "Prosody, multilingual transfer, and broader speaker generalization remain unresolved.",
      "axes_regressed": "",
      "technical_limits": "Limited prosody modeling due to speech SSL embeddings; speaker-independence and generalization remain unaddressed.",
      "evaluation_limits": "Evaluations are dataset-constrained with no real-world latency or deployment testing; transcripts missing for Lip2Wav requiring whisper for finetuning; limited testing on unseen words and speaker independence.",
      "deployment_limits": "No real-time or on-device deployment demonstrated; evaluation limited to speaker-dependent or seen-speaker scenarios.",
      "scope_limits": "Focuses on lip-to-speech synthesis from silent lip video in speaker-specific or seen-speaker conditions with no claimed generalization to unseen speaker or real-time applications.",
      "task": "speech-reconstruction",
      "input_modality": "video",
      "sensor_hardware": "camera",
      "body_site": "lip",
      "output_type": "speech-audio",
      "vocabulary_type": "constrained (GRID-4S) and unconstrained (Lip2Wav, TCD-TIMIT-3S)",
      "vocabulary_size": "Lip2Wav speakers > 5000 words; GRID-4S limited to 51 words, TCD-TIMIT-3S includes 375 sentences per speaker.",
      "metrics": "STOI, ESTOI, WER, and Mean Opinion Score (MOS) evaluated on Lip2Wav, GRID-4S, and TCD-TIMIT-3S datasets; e.g., on TCD-TIMIT-3S STOI 0.596, ESTOI 0.452, WER 29.03; on Lip2Wav improvements in STOI/ESTOI up to 0.627/0.419",
      "evaluation_mode": "Objective (STOI, ESTOI, WER) and subjective (MOS) evaluations on standard benchmarks GRID-4S, TCD-TIMIT-3S, and Lip2Wav datasets, under speaker-dependent and constrained settings.",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 1,
          "statement": "We propose RobustL2S, a modularized framework for Lip-to-Speech synthesis that first maps self-supervised visual features to disentangled speech content representations, then converts these to waveforms with a vocoder.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_robustl2s-speaker-specific-lip-to-speech-synthesis-exploiting-self-supervised-representations-robustl2s-speaker-specific-lip-to-speech-synthesis-exploiting-self-supervised-re.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_robustl2s-speaker-specific-lip-to-speech-synthesis-exploiting-self-supervised-representations-robustl2s-speaker-specific-lip-to-speech-synthesis-exploiting-self-supervised-re.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 1,
          "statement": "RobustL2S explicitly separates the lip encoder extracting AV-HuBERT SSL representations, the non-autoregressive seq2seq mapping to speech HuBERT SSL representations, and the vocoder for waveform synthesis, moving away from direct mel-spectrogram regression.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_robustl2s-speaker-specific-lip-to-speech-synthesis-exploiting-self-supervised-representations-robustl2s-speaker-specific-lip-to-speech-synthesis-exploiting-self-supervised-re.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_robustl2s-speaker-specific-lip-to-speech-synthesis-exploiting-self-supervised-representations-robustl2s-speaker-specific-lip-to-speech-synthesis-exploiting-self-supervised-re.txt",
          "section_or_location": "III. METHOD",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "On TCD-TIMIT-3S, RobustL2S achieves STOI 0.596, ESTOI 0.452, and WER 29.03; Lip2Wav speaker-dependent results improve STOI and ESTOI up to 0.627 and 0.419 respectively on Deep Learning speaker.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_robustl2s-speaker-specific-lip-to-speech-synthesis-exploiting-self-supervised-representations-robustl2s-speaker-specific-lip-to-speech-synthesis-exploiting-self-supervised-re.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_robustl2s-speaker-specific-lip-to-speech-synthesis-exploiting-self-supervised-representations-robustl2s-speaker-specific-lip-to-speech-synthesis-exploiting-self-supervised-re.txt",
          "section_or_location": "V. RESULTS",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 1,
          "statement": "RobustL2S is evaluated on speaker-dependent settings on unconstrained Lip2Wav and constrained GRID-4S and TCD-TIMIT-3S datasets; no claim for real-time or speaker-independent evaluation is made.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_robustl2s-speaker-specific-lip-to-speech-synthesis-exploiting-self-supervised-representations-robustl2s-speaker-specific-lip-to-speech-synthesis-exploiting-self-supervised-re.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_robustl2s-speaker-specific-lip-to-speech-synthesis-exploiting-self-supervised-representations-robustl2s-speaker-specific-lip-to-speech-synthesis-exploiting-self-supervised-re.txt",
          "section_or_location": "IV. EXPERIMENTS",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 1,
          "statement": "The paper notes prosody remains limited due to speech SSL embeddings and strongest results remain speaker-specific; deployment is not addressed for real-time or mobile.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_robustl2s-speaker-specific-lip-to-speech-synthesis-exploiting-self-supervised-representations-robustl2s-speaker-specific-lip-to-speech-synthesis-exploiting-self-supervised-re.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_robustl2s-speaker-specific-lip-to-speech-synthesis-exploiting-self-supervised-representations-robustl2s-speaker-specific-lip-to-speech-synthesis-exploiting-self-supervised-re.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_diff-foley-synchronized-video-to-audio-synthesis-with-latent-diffusion-models",
      "slug": "diff-foley-synchronized-video-to-audio-synthesis-with-latent-diffusion-models",
      "title": "Diff-Foley: Synchronized Video-to-Audio Synthesis with Latent Diffusion Models",
      "year": 2023,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Chuanhao Luo",
        "Chenxu Yan",
        "Hang Hu",
        "et al."
      ],
      "url": "https://nao-ki-mura.com/paper/diff-foley-synchronized-video-to-audio-synthesis-with-latent-diffusion-models",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2306.17203",
      "arxiv_url": "https://arxiv.org/abs/2306.17203",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "high",
      "review_basis": "full_text+structured_benchmark+summary",
      "source_coverage": "high",
      "tags": [
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "modality:acoustic",
        "modality:multimodal",
        "modality:video",
        "output:audio"
      ],
      "expert_take_short": "The real gain is not 'diffusion' alone but aligned conditioning plus guidance that pushes synchronization very hard.",
      "expert_take_long": "The full text shows a more complete system than the earlier draft suggested. CAVP is used to make visual features carry audio-related timing information before generation starts, and double guidance then sharpens sampling quality. The result is strong on the paper's chosen benchmarks: a large jump over SpecVQGAN on IS and a very high alignment accuracy while using only 4 FPS video and fast DPM-Solver sampling. The downstream EPIC-Kitchens section also makes the generalization claim more credible. The main caveat is that this is still a compute-heavy neural Foley stack, not silent speech, and the authors explicitly say billion-scale scaling is untested.",
      "expert_true_value": "Strong neural Foley paper, outside SSI: its main contribution is aligned-feature conditioning for diffusion-based V2A, with unusually good synchronization numbers for the time.",
      "canon_before": "Earlier V2A systems improved semantic relevance but struggled to make generated sounds temporally align with what the video was actually doing.",
      "delta_from_canon": "The paper treats audio-visual alignment as a first-class pretraining objective and pairs it with latent diffusion plus double guidance to raise synchronization and quality together.",
      "position_in_field": "Competitive V2A foundation-style model for synchronized Foley generation.",
      "practical_value": "Relevant for film and video post-production assistance where synchronized environmental audio matters.",
      "axes_moved": "synchronization; audio-visual alignment; diffusion quality",
      "axes_unresolved": "billion-scale training; low-resource adaptation; broader perceptual validation",
      "axes_regressed": "compute cost",
      "technical_limits": "Diffusion remains heavier than GAN baselines, and the paper explicitly says scalability to super-large datasets is still untested.",
      "evaluation_limits": "Main quantitative evidence is on VGGSound, with downstream EPIC-Kitchens evidence mostly qualitative; the work is not evaluated for SSI or human communication outcomes.",
      "deployment_limits": "Promising for offline production assistance, but still too heavy and domain-specific for lightweight deployment claims.",
      "scope_limits": "Video-to-audio Foley synthesis only; outside SSI scope.",
      "task": "video-to-audio generation",
      "input_modality": "video",
      "sensor_hardware": "",
      "body_site": "",
      "output_type": "audio",
      "vocabulary_type": "",
      "vocabulary_size": "",
      "metrics": "Table 1: with double guidance the model reaches IS 62.37, FID 9.87, KL 6.43, Align Acc 94.05, and 0.38s average inference time per sample with DPM-Solver at 25 steps. Table 5: scaling Stage 1 to VGGSound+AudioSet-V2A pushes Align Acc to 94.78 under DDIM evaluation.",
      "evaluation_mode": "VGGSound quantitative evaluation, ablations on guidance and pretraining scale, sampler-speed study, and downstream fine-tuning on EPIC-Kitchens",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 0.99,
          "statement": "Diff-Foley learns temporally and semantically aligned audio-visual features with CAVP and then conditions a latent diffusion model on those aligned features to synthesize synchronized Foley audio.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_diff-foley-synchronized-video-to-audio-synthesis-with-latent-diffusion-models-diff-foley-synchronized-video-to-audio-synthesis-with-latent-diffusion-models.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_diff-foley-synchronized-video-to-audio-synthesis-with-latent-diffusion-models-diff-foley-synchronized-video-to-audio-synthesis-with-latent-diffusion-models.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.99,
          "statement": "Using double guidance, the model reports IS 62.37, FID 9.87, KL 6.43, Align Acc 94.05, and 0.38s average inference time per sample with DPM-Solver at 25 steps.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_diff-foley-synchronized-video-to-audio-synthesis-with-latent-diffusion-models-diff-foley-synchronized-video-to-audio-synthesis-with-latent-diffusion-models.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_diff-foley-synchronized-video-to-audio-synthesis-with-latent-diffusion-models-diff-foley-synchronized-video-to-audio-synthesis-with-latent-diffusion-models.txt",
          "section_or_location": "Table 1",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 0.97,
          "statement": "The paper fine-tunes Stage 2 on EPIC-Kitchens after VGGSound pretraining and reports qualitatively synchronized object-interaction sounds such as knife cutting, water flow, and plate clinking.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_diff-foley-synchronized-video-to-audio-synthesis-with-latent-diffusion-models-diff-foley-synchronized-video-to-audio-synthesis-with-latent-diffusion-models.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_diff-foley-synchronized-video-to-audio-synthesis-with-latent-diffusion-models-diff-foley-synchronized-video-to-audio-synthesis-with-latent-diffusion-models.txt",
          "section_or_location": "4.2   Downstream Finetuning",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 0.96,
          "statement": "The authors explicitly say scalability to super-large datasets remains untested because of limited computation and that diffusion models are slower than GANs.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_diff-foley-synchronized-video-to-audio-synthesis-with-latent-diffusion-models-diff-foley-synchronized-video-to-audio-synthesis-with-latent-diffusion-models.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_diff-foley-synchronized-video-to-audio-synthesis-with-latent-diffusion-models-diff-foley-synchronized-video-to-audio-synthesis-with-latent-diffusion-models.txt",
          "section_or_location": "5     Limitations and Broader Impact",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_high-quality-automatic-voice-over-with-accurate-alignment-supervision-through-self-supervised-discrete-speech-units",
      "slug": "high-quality-automatic-voice-over-with-accurate-alignment-supervision-through-self-supervised-discrete-speech-units",
      "title": "High-Quality Automatic Voice Over with Accurate Alignment: Supervision through Self-Supervised Discrete Speech Units",
      "year": 2023,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Junchen Lu",
        "Berrak Sisman",
        "Mingyang Zhang",
        "Haizhou Li"
      ],
      "url": "https://nao-ki-mura.com/paper/high-quality-automatic-voice-over-with-accurate-alignment-supervision-through-self-supervised-discrete-speech-units",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2306.17005",
      "arxiv_url": "https://arxiv.org/abs/2306.17005",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium-high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:lip",
        "modality:video",
        "output:speech-audio",
        "evaluation:quantitative"
      ],
      "expert_take_short": "This video-conditioned AVO system innovatively supervises alignment by predicting discrete speech units rather than reconstructing acoustic features, leading to better lip-sync and speech quality on a single-speaker dataset; however, it is not an SSI interface paper.",
      "expert_take_long": "This paper presents DSU-AVO, an automatic voice over system that innovatively supervises the multimodal alignment learning through discrete speech unit prediction derived from self-supervised HuBERT models and clustering. By replacing acoustic feature reconstruction with classification of discrete units at the context level, the approach provides more direct learning signals that yield improved lip-sync accuracy and speech naturalness. The system integrates a pretrained unit vocoder for synthesis conditioned on predicted units, effectively alleviating the mismatch typical in prior acoustic decoding. Experimental evaluation on the single-speaker Chem dataset demonstrates significant gains over baselines in synchronization metrics (LSE-C 6.81 vs. 6.11 for Neural Dubber), duration deviation (FD 3.23 vs. 9.39), intelligibility (WER 24.7% vs. 75.8%), and subjective quality (MOS 3.98 vs. 2.43). Despite these strong contributions, the work is limited to single-speaker settings without multi-speaker or in-the-wild testing and depends on pretrained components, limiting immediate deployment scope. Furthermore, it does not constitute a silent speech interface per se but advances alignment supervision in video-conditioned speech synthesis.",
      "expert_true_value": "The key advance is reframing alignment supervision from acoustic regression to discrete speech unit prediction using self-supervised speech representations, delivering more direct and effective training for lip-synchronized speech generation in automatic voice over.",
      "canon_before": "Prior AVO systems used acoustic feature (mel-spectrogram) reconstruction as a learning objective, which provides indirect supervision for alignment and suffers from mismatch between context and acoustic features.",
      "delta_from_canon": "Replaces acoustic feature regression with discrete speech unit prediction derived via HuBERT and k-means clustering for direct alignment supervision, and uses Unit HiFi-GAN vocoder for synthesis conditioned on predicted units, reducing mismatch and improving synchronization and naturalness.",
      "position_in_field": "Advances automatic voice over and video-conditioned TTS; adjacent but not focused on silent speech interfaces.",
      "practical_value": "Improves synchronization accuracy and speech naturalness for scripted dubbing applications, useful in media production and related domains.",
      "axes_moved": "system_design; evaluation",
      "axes_unresolved": "Multi-speaker generalization; expressive prosody; strict SSI relevance",
      "axes_regressed": "",
      "technical_limits": "Single-speaker Chem dataset only; reliant on pretrained unit tokenizer and vocoder; no tests on multi-speaker, unseen words, or moving conditions; contextual mismatch remains an open issue outside studied domain.",
      "evaluation_limits": "Evaluation limited to single-speaker Chem dataset; WER is computed with an ASR model pretrained only on LibriSpeech and not adapted to the domain; no unseen words or walking tests reported.",
      "deployment_limits": "Requires input of video frames and text scripts; depends on pretrained unit tokenizer and vocoder; only validated on single-speaker Chem dataset; no multi-speaker or in-the-wild testing yet.",
      "scope_limits": "Specific to automatic voice over with text and video inputs; not a generic silent speech interface solution.",
      "task": "automatic voice over",
      "input_modality": "text + video (lip image frames)",
      "sensor_hardware": "camera",
      "body_site": "lip",
      "output_type": "speech-audio",
      "vocabulary_type": "",
      "vocabulary_size": "",
      "metrics": "Lip Sync Error Confidence (LSE-C) 6.81, Lip Sync Error Distance (LSE-D) 7.56, Frame Disturbance (FD) 3.23, Word Error Rate (WER) 24.7%, Mean Opinion Score (MOS) 3.98 ± 0.08, Best-Worst Scaling (BWS) best 84.0% / worst 1.3%",
      "evaluation_mode": "Objective metrics (LSE-C, LSE-D, FD, WER) and subjective listening tests (MOS, BWS)",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 1,
          "statement": "We propose a novel AVO method leveraging the learning objective of self-supervised discrete speech unit prediction, which not only provides more direct supervision for the alignment learning, but also alleviates the mismatch between the text-video context and acoustic features.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_high-quality-automatic-voice-over-with-accurate-alignment-supervision-through-self-supervised-discrete-speech-units-high-quality-automatic-voice-over-with-accurate-alignment-supervision-through-se.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_high-quality-automatic-voice-over-with-accurate-alignment-supervision-through-self-supervised-discrete-speech-units-high-quality-automatic-voice-over-with-accurate-alignment-supervision-through-se.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 1,
          "statement": "We propose to guide the context modeling and alignment learning of AVO more directly by imposing discrete speech unit prediction as supervision at the context representation level, replacing mel-spectrogram reconstruction supervision, thus enabling more accurate lip-speech synchronization.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_high-quality-automatic-voice-over-with-accurate-alignment-supervision-through-self-supervised-discrete-speech-units-high-quality-automatic-voice-over-with-accurate-alignment-supervision-through-se.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_high-quality-automatic-voice-over-with-accurate-alignment-supervision-through-self-supervised-discrete-speech-units-high-quality-automatic-voice-over-with-accurate-alignment-supervision-through-se.txt",
          "section_or_location": "We propose to guide the context modeling",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 1,
          "statement": "We utilize Chem dataset, a single-speaker audio-visual English speech dataset, with official transcripts from YouTube, using 6088 training samples, 200 validation, and 200 test samples for evaluation of our AVO systems.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_high-quality-automatic-voice-over-with-accurate-alignment-supervision-through-self-supervised-discrete-speech-units-high-quality-automatic-voice-over-with-accurate-alignment-supervision-through-se.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_high-quality-automatic-voice-over-with-accurate-alignment-supervision-through-self-supervised-discrete-speech-units-high-quality-automatic-voice-over-with-accurate-alignment-supervision-through-se.txt",
          "section_or_location": "4.1.1. Dataset",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "DSU-AVO achieves Lip Sync Error - Confidence (LSE-C) of 6.81, Lip Sync Error - Distance (LSE-D) of 7.56, Frame Disturbance (FD) of 3.23, Word Error Rate (WER) of 24.7%, Mean Opinion Score (MOS) of 3.98 ± 0.08, and Best-Worst Scaling (BWS) for lip-speech synchronization best 84.0% and worst 1.3%.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_high-quality-automatic-voice-over-with-accurate-alignment-supervision-through-self-supervised-discrete-speech-units-high-quality-automatic-voice-over-with-accurate-alignment-supervision-through-se.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_high-quality-automatic-voice-over-with-accurate-alignment-supervision-through-self-supervised-discrete-speech-units-high-quality-automatic-voice-over-with-accurate-alignment-supervision-through-se.txt",
          "section_or_location": "4.2. Experimental results",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 1,
          "statement": "The method is trained and tested only on the single-speaker Chem dataset; it relies on paired video, text, and audio data and pretrained unit tokenizer and vocoder; generalization to multi-speaker, unseen words, and walking conditions is not reported.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_high-quality-automatic-voice-over-with-accurate-alignment-supervision-through-self-supervised-discrete-speech-units-high-quality-automatic-voice-over-with-accurate-alignment-supervision-through-se.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_high-quality-automatic-voice-over-with-accurate-alignment-supervision-through-self-supervised-discrete-speech-units-high-quality-automatic-voice-over-with-accurate-alignment-supervision-through-se.txt",
          "section_or_location": "4.1.1. Dataset",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "deployment_claim",
          "confidence": 0.9,
          "statement": "DSU-AVO requires text and video frame inputs, pretrained SSL-based discrete unit tokenizer and vocoder models, and has only demonstrated results on single-speaker data, limiting immediate deployment in multi-speaker or diverse dubbing scenarios.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_high-quality-automatic-voice-over-with-accurate-alignment-supervision-through-self-supervised-discrete-speech-units-high-quality-automatic-voice-over-with-accurate-alignment-supervision-through-se.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_high-quality-automatic-voice-over-with-accurate-alignment-supervision-through-self-supervised-discrete-speech-units-high-quality-automatic-voice-over-with-accurate-alignment-supervision-through-se.txt",
          "section_or_location": "3.3. DSU",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_large-scale-unsupervised-audio-pre-training-for-video-to-speech-synthesis",
      "slug": "large-scale-unsupervised-audio-pre-training-for-video-to-speech-synthesis",
      "title": "Large-scale unsupervised audio pre-training for video-to-speech synthesis",
      "year": 2023,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Triantafyllos Kefalas",
        "Yannis Panagakis",
        "Maja Pantic"
      ],
      "url": "https://nao-ki-mura.com/paper/large-scale-unsupervised-audio-pre-training-for-video-to-speech-synthesis",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2306.15464",
      "arxiv_url": "https://arxiv.org/abs/2306.15464",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:lip",
        "modality:video",
        "output:speech-audio",
        "task:speech-reconstruction",
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "deployment:speaker-independent"
      ],
      "expert_take_short": "Good decoder-transfer pretraining improves video-to-speech quality on several benchmarks, but WER gains are not consistent. A useful methodological contribution with strong benchmark support, adjacent to SSI rather than a deployable system.",
      "expert_take_long": "This paper methodologically advances video-to-speech synthesis by proposing an audio-to-audio pretraining stage on large audio-only speech corpora, which initializes the decoder in video-to-speech models. The authors design two encoder-decoder models generating either raw waveforms or mel spectrograms and pretrain corresponding audio-to-audio models on 3572 hours of diverse English speech data. Fine-tuning on several benchmarks under seen and unseen speaker conditions shows the pretraining improves perceptual quality and intelligibility metrics in many cases, though not uniformly across all. The approach reframes the data scarcity issue in video-to-speech by leveraging abundant audio-only corpora, a significant departure from canonical paired audio-visual training. Architectural improvements like batch normalization adaptation for cross-modal fine-tuning bolster the transfer. However, the work is a benchmark-method contribution evaluated only with objective offline metrics, lacking latency, robustness, or user studies, and with some inconsistent WER gains. It is valuable for data-efficient video-to-speech reconstruction research adjacent to SSI, rather than a ready interface.",
      "expert_true_value": "A valuable decoder-initialization and pretraining study allowing large audio-only corpora to improve video-to-speech reconstruction quality, advancing data efficiency in cross-modal speech synthesis.",
      "canon_before": "Video-to-speech systems largely rely on paired audio-visual data, limiting the use of large audio-only corpora.",
      "delta_from_canon": "Moves decoder learning into an audio-only pretraining stage and transfers it back into video-to-speech.",
      "position_in_field": "Strong adjacent reconstruction paper.",
      "practical_value": "Useful when paired audio-visual data are scarce but large audio-only speech corpora exist; improves decoder initialization and speech reconstruction quality.",
      "axes_moved": "decoder_pretraining; audio_only_transfer; reconstruction_quality",
      "axes_unresolved": "consistent_wer_gains; live_capture; deployment_latency",
      "axes_regressed": "",
      "technical_limits": "Pretraining does not reliably improve WER uniformly; benchmark audio quality remains modest; model complexity and latency implications not analyzed.",
      "evaluation_limits": "Benchmark evaluation on standard datasets with objective metrics only; no human perceptual tests or deployment contexts evaluated.",
      "deployment_limits": "No latency or real-world capture robustness study is given. No evidence of real-time deployment or joint visual robustness under diverse conditions.",
      "scope_limits": "Focus on video-to-speech waveform and mel spectrogram reconstruction only; no text-entry or command recognition addressed.",
      "task": "speech-reconstruction",
      "input_modality": "video",
      "sensor_hardware": "Video input plus large-scale audio-only corpora used in pretraining; video from mouth region extraction; pretrained face/speaker embeddings for identity",
      "body_site": "lip",
      "output_type": "speech-audio",
      "vocabulary_type": "",
      "vocabulary_size": "",
      "metrics": "Objective reconstruction metrics: PESQ (1.26-2.07), STOI (0.49-0.72), ESTOI (0.20-0.53), and WER (2.66% to 42.38%) measured on GRID, TCD-TIMIT, LRW datasets for seen and unseen speaker splits; comparison to prior works such as WGAN and SVTS included; multiple cross-modal fine-tuning strategies evaluated.",
      "evaluation_mode": "Seen and unseen speaker evaluation on GRID, TCD-TIMIT, and LRW datasets, with quantitative metrics (PESQ, STOI, ESTOI, WER)",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 1,
          "statement": "In this paper we propose to train encoder-decoder models on more than 3,500 hours of audio data at 24kHz, and then use the pre-trained decoders to initialize the audio decoders for the video-to-speech synthesis task.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_large-scale-unsupervised-audio-pre-training-for-video-to-speech-synthesis-large-scale-unsupervised-audio-pre-training-for-video-to-speech-synthesis.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_large-scale-unsupervised-audio-pre-training-for-video-to-speech-synthesis-large-scale-unsupervised-audio-pre-training-for-video-to-speech-synthesis.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 1,
          "statement": "We pre-train the audio-to-audio models on a combination of speech corpora in an unsupervised manner and use these pre-trained models to initialize and fine-tune the corresponding video-to-audio models.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_large-scale-unsupervised-audio-pre-training-for-video-to-speech-synthesis-large-scale-unsupervised-audio-pre-training-for-video-to-speech-synthesis.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_large-scale-unsupervised-audio-pre-training-for-video-to-speech-synthesis-large-scale-unsupervised-audio-pre-training-for-video-to-speech-synthesis.txt",
          "section_or_location": "III. VIDEO",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 1,
          "statement": "We propose a modification to the batch normalization module which enables cross-modal parameter fine-tuning by keeping track of separate running statistics for the audio and video modalities. We apply this to the batch normalization layers of the decoder.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_large-scale-unsupervised-audio-pre-training-for-video-to-speech-synthesis-large-scale-unsupervised-audio-pre-training-for-video-to-speech-synthesis.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_large-scale-unsupervised-audio-pre-training-for-video-to-speech-synthesis-large-scale-unsupervised-audio-pre-training-for-video-to-speech-synthesis.txt",
          "section_or_location": "III. VIDEO",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 1,
          "statement": "We conduct experiments on popular audio-visual datasets on seen (GRID, TCD-TIMIT) and unseen (GRID, LRW) speakers.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_large-scale-unsupervised-audio-pre-training-for-video-to-speech-synthesis-large-scale-unsupervised-audio-pre-training-for-video-to-speech-synthesis.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_large-scale-unsupervised-audio-pre-training-for-video-to-speech-synthesis-large-scale-unsupervised-audio-pre-training-for-video-to-speech-synthesis.txt",
          "section_or_location": "I. INTRODUCTION",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "On GRID unseen, V2A-MelSpec-S-SP + fine-tuning reaches PESQ 1.43, STOI 0.598, ESTOI 0.335, and WER 17.90; on LRW, V2A-MelSpec-M-SP trained from scratch reaches 1.48, 0.649, 0.484, and 14.96, showing pretraining helps quality but not always WER.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_large-scale-unsupervised-audio-pre-training-for-video-to-speech-synthesis-large-scale-unsupervised-audio-pre-training-for-video-to-speech-synthesis.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_large-scale-unsupervised-audio-pre-training-for-video-to-speech-synthesis-large-scale-unsupervised-audio-pre-training-for-video-to-speech-synthesis.txt",
          "section_or_location": "VI. RESULTS",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "deployment_claim",
          "confidence": 0.7,
          "statement": "No latency or real-world capture robustness study is given, so deployment readiness is limited.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_large-scale-unsupervised-audio-pre-training-for-video-to-speech-synthesis-large-scale-unsupervised-audio-pre-training-for-video-to-speech-synthesis.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_large-scale-unsupervised-audio-pre-training-for-video-to-speech-synthesis-large-scale-unsupervised-audio-pre-training-for-video-to-speech-synthesis.txt",
          "section_or_location": "VII. CONCLUSION",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 1,
          "statement": "Pretraining does not uniformly improve WER and benchmark audio quality is still modest.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_large-scale-unsupervised-audio-pre-training-for-video-to-speech-synthesis-large-scale-unsupervised-audio-pre-training-for-video-to-speech-synthesis.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_large-scale-unsupervised-audio-pre-training-for-video-to-speech-synthesis-large-scale-unsupervised-audio-pre-training-for-video-to-speech-synthesis.txt",
          "section_or_location": "VI. RESULTS",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 1,
          "statement": "All evidence is benchmark-based with no live interface or user-facing deployment.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_large-scale-unsupervised-audio-pre-training-for-video-to-speech-synthesis-large-scale-unsupervised-audio-pre-training-for-video-to-speech-synthesis.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_large-scale-unsupervised-audio-pre-training-for-video-to-speech-synthesis-large-scale-unsupervised-audio-pre-training-for-video-to-speech-synthesis.txt",
          "section_or_location": "VII. CONCLUSION",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_lipvoicer-generating-speech-from-silent-videos-guided-by-lip-reading",
      "slug": "lipvoicer-generating-speech-from-silent-videos-guided-by-lip-reading",
      "title": "LipVoicer: Generating Speech from Silent Videos Guided by Lip Reading",
      "year": 2023,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Yochai Yemini",
        "Aviv Shamsian",
        "Lior Bracha",
        "Sharon Gannot",
        "Ethan Fetaya"
      ],
      "url": "https://nao-ki-mura.com/paper/lipvoicer-generating-speech-from-silent-videos-guided-by-lip-reading",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2306.03258",
      "arxiv_url": "https://arxiv.org/abs/2306.03258",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:lip",
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "modality:video",
        "output:speech-audio",
        "task:speech-reconstruction"
      ],
      "expert_take_short": "Strong full-text paper demonstrating that inference-time text guidance via an ASR classifier is key to significantly improved intelligibility in lip-to-speech synthesis on challenging in-the-wild video datasets, outperforming prior baselines.",
      "expert_take_long": "LipVoicer advances lip-to-speech synthesis by decoupling the task into lip-reading and guided diffusion generation, using classifier guidance derived from ASR on inferred text at inference time. This disentangles text content estimation from speech characteristic generation, markedly improving intelligibility and naturalness on challenging, in-the-wild datasets LRS2 and LRS3 compared to recent baselines. Extensive evaluation shows LipVoicer nearly matches ground truth metrics on human and machine assessments, and ablation studies demonstrate that ASR-based text guidance at inference is critical, with WER rising from ~21% to over 86% without it. The system is modular and allows substitution of lip-reader and ASR for improvements. However, it relies on heavy models and offline generation, limiting deployment on mobile or real-time scenarios, and poses risks of speech manipulation if malicious text is injected. Overall, LipVoicer sets a new standard for intelligible and natural lip-to-speech synthesis on complex data, useful as a reference architecture for future work focused on intelligibility rather than real-time constraints.",
      "expert_true_value": "Shows the crucial role of leveraging text inferred by lip-reading at inference as classifier guidance to overcome intrinsic ambiguity in lip motion, resulting in state-of-the-art intelligibility and naturalness in lip-to-speech synthesis for in-the-wild data.",
      "canon_before": "Prior lip-to-speech systems map silent lip video directly to audio or audio features, often producing ambiguous or unintelligible speech on in-the-wild datasets with diverse speakers and open vocabulary.",
      "delta_from_canon": "LipVoicer incorporates inferred text from a lip-reading network at inference time as guidance for diffusion-based speech generation, reducing ambiguity and improving intelligibility and synchronization.",
      "position_in_field": "A top-tier recent lip-to-speech synthesis system achieving near ground-truth intelligibility and naturalness on unconstrained in-the-wild datasets, setting a new state-of-the-art for video-to-speech intelligibility over prior direct regression or unit-based methods.",
      "practical_value": "Highly practical as a reference architecture for generating intelligible speech from silent videos with unconstrained vocabulary in challenging real-world conditions; less suitable for mobile or real-time applications currently.",
      "axes_moved": "text_guidance; intelligibility; in_the_wild_video_to_speech",
      "axes_unresolved": "real-time deployment; misuse mitigation; dependence on lip-reader quality",
      "axes_regressed": "",
      "technical_limits": "Depends on quality and performance of large pretrained lip-reader and ASR models plus heavy diffusion model; generation is offline with hundreds of inference steps limiting real-time use; quality degrades with less accurate lip-readers; risk of adversarial misuse via text injection.",
      "evaluation_limits": "Evaluations are strong on benchmark datasets LRS2 and LRS3 but not aimed at real-time, mobile, or on-device use; reliance on quality lip-reader and ASR.",
      "deployment_limits": "Heavy model stack including lip-reader, ASR, and diffusion; no focus on low-latency or on-device deployment; recognized risk of misuse from malicious text injection.",
      "scope_limits": "Lip-to-speech from silent lip video only; does not handle noisy audio/video inputs or multimodal fusion beyond lip video and inferred text guidance.",
      "task": "speech-reconstruction",
      "input_modality": "silent lip video",
      "sensor_hardware": "camera",
      "body_site": "lip",
      "output_type": "speech-audio",
      "vocabulary_type": "open",
      "vocabulary_size": "open vocabulary including hundreds of speakers' utterances as in LRS2/LRS3 datasets",
      "metrics": "Human MOS scores for intelligibility, naturalness, quality, synchronization with mean and confidence intervals; objective automatic speech recognition word error rate (WER), non-intrusive speech quality and intelligibility DNSMOS and STOI-Net; SyncNet audio-visual synchronization metrics (LSE-C, LSE-D). WER on the LRS3 test set rises from 21.4% with LipVoicer to 86.2% without ASR guidance; LipVoicer outperforms baselines Lip2Speech, VCA-GAN, SVTS in both human and objective metrics.",
      "evaluation_mode": "Human mean opinion scores (MOS) on intelligibility, naturalness, quality, synchronization; objective metrics including WER, DNSMOS, STOI-Net, SyncNet metrics on LRS2 and LRS3 test sets.",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 1,
          "statement": "The paper proposes LipVoicer, which first predicts text from lip video and then uses that text to guide diffusion-based speech generation.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_lipvoicer-generating-speech-from-silent-videos-guided-by-lip-reading-lipvoicer-generating-speech-from-silent-videos-guided-by-lip-reading.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_lipvoicer-generating-speech-from-silent-videos-guided-by-lip-reading-lipvoicer-generating-speech-from-silent-videos-guided-by-lip-reading.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 1,
          "statement": "LipVoicer combines lip-reading, diffusion generation, classifier-free guidance, and ASR classifier guidance in one inference loop, a novel system design in lip-to-speech.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_lipvoicer-generating-speech-from-silent-videos-guided-by-lip-reading-lipvoicer-generating-speech-from-silent-videos-guided-by-lip-reading.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_lipvoicer-generating-speech-from-silent-videos-guided-by-lip-reading-lipvoicer-generating-speech-from-silent-videos-guided-by-lip-reading.txt",
          "section_or_location": "4",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 1,
          "statement": "LipVoicer is evaluated on LRS2 and LRS3, in-the-wild audiovisual datasets with hundreds of speakers and open vocabulary; evaluation includes human MOS and objective WER, DNSMOS, STOI-Net, and synchronization with SyncNet metrics.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_lipvoicer-generating-speech-from-silent-videos-guided-by-lip-reading-lipvoicer-generating-speech-from-silent-videos-guided-by-lip-reading.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_lipvoicer-generating-speech-from-silent-videos-guided-by-lip-reading-lipvoicer-generating-speech-from-silent-videos-guided-by-lip-reading.txt",
          "section_or_location": "5",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "On LRS3 ablation, removing ASR guidance increases WER from 21.4% to 86.2%, showing that text guidance is essential for intelligibility; LipVoicer outperforms baselines Lip2Speech, VCA-GAN, SVTS in human and objective evaluations.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_lipvoicer-generating-speech-from-silent-videos-guided-by-lip-reading-lipvoicer-generating-speech-from-silent-videos-guided-by-lip-reading.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_lipvoicer-generating-speech-from-silent-videos-guided-by-lip-reading-lipvoicer-generating-speech-from-silent-videos-guided-by-lip-reading.txt",
          "section_or_location": "5",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 1,
          "statement": "The stack depends on a lip-reader, an ASR model, and a relatively heavy diffusion model; inference is not real-time and deployment on mobile or low-latency platforms is not claimed or targeted; misuse risk from injected text at inference is acknowledged.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_lipvoicer-generating-speech-from-silent-videos-guided-by-lip-reading-lipvoicer-generating-speech-from-silent-videos-guided-by-lip-reading.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_lipvoicer-generating-speech-from-silent-videos-guided-by-lip-reading-lipvoicer-generating-speech-from-silent-videos-guided-by-lip-reading.txt",
          "section_or_location": "6",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "deployment_claim",
          "confidence": 1,
          "statement": "LipVoicer requires a heavy model stack for inference and does not claim real-time or on-device deployment readiness; social misuse risks from incorrect or malicious text guidance are noted but not addressed here.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_lipvoicer-generating-speech-from-silent-videos-guided-by-lip-reading-lipvoicer-generating-speech-from-silent-videos-guided-by-lip-reading.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_lipvoicer-generating-speech-from-silent-videos-guided-by-lip-reading-lipvoicer-generating-speech-from-silent-videos-guided-by-lip-reading.txt",
          "section_or_location": "6",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "fact",
          "confidence": 1,
          "statement": "LipVoicer achieved human mean opinion scores near ground truth on LRS2 and LRS3 on intelligibility (3.44-3.53 vs 4.33-4.38), naturalness, quality, and synchronization, outperforming recent baselines.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_lipvoicer-generating-speech-from-silent-videos-guided-by-lip-reading-lipvoicer-generating-speech-from-silent-videos-guided-by-lip-reading.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_lipvoicer-generating-speech-from-silent-videos-guided-by-lip-reading-lipvoicer-generating-speech-from-silent-videos-guided-by-lip-reading.txt",
          "section_or_location": "5",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "fact",
          "confidence": 1,
          "statement": "LipVoicer uses pretrained lip-reader Ma et al. (2023) with WER 14.6% on LRS2 and 19.1% on LRS3 to generate text at inference time; uses a modified ASR Burchi & Timofte (2023) fine-tuned on noised data for classifier guidance.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_lipvoicer-generating-speech-from-silent-videos-guided-by-lip-reading-lipvoicer-generating-speech-from-silent-videos-guided-by-lip-reading.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_lipvoicer-generating-speech-from-silent-videos-guided-by-lip-reading-lipvoicer-generating-speech-from-silent-videos-guided-by-lip-reading.txt",
          "section_or_location": "5",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_intelligible-lip-to-speech-synthesis-with-speech-units",
      "slug": "intelligible-lip-to-speech-synthesis-with-speech-units",
      "title": "Intelligible Lip-to-Speech Synthesis with Speech Units",
      "year": 2023,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "[''",
        "''",
        "'']"
      ],
      "url": "https://nao-ki-mura.com/paper/intelligible-lip-to-speech-synthesis-with-speech-units",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2305.19603",
      "arxiv_url": "https://arxiv.org/abs/2305.19603",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:lip",
        "modality:video",
        "output:speech-audio",
        "task:speech-reconstruction",
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "deployment:speaker-independent"
      ],
      "expert_take_short": "Speech units as a pseudo-text target enable strong content supervision that substantially cuts WER without text labels, and the multi-input vocoder improves speech quality from blurry mel outputs, yielding a state-of-the-art lip-to-speech system on LRS benchmarks.",
      "expert_take_long": "This paper presents a significant advancement in lip-to-speech synthesis by combining self-supervised quantized speech units as auxiliary prediction targets with a multi-input vocoder that conditions on both mel-spectrograms and speech units. This architecture allows improved speech content modeling without text labels, a key bottleneck in prior work. Experimental results on LRS3 and LRS2 benchmarks demonstrate clear metric improvements, notably a reduction in word error rate (WER) from 65.8% to 29.8% when using a strong AV-HuBERT visual encoder plus speech units. The vocoder design, augmented during training with blur and noise, further enables intelligible speech synthesis from blurry predicted mel features. Human evaluations confirm gains in naturalness, intelligibility, and clearness relative to prior state of the art. Despite these merits, the work remains benchmark-focused without investigation of real-time operation, mobile compatibility, or robustness to real-world video variations such as occlusion or pose changes. The approach significantly moves the field towards practical lip-to-speech reconstruction by eliminating reliance on text labels, but future work must address deployment and generalization challenges.",
      "expert_true_value": "A strong contribution to video-based speech reconstruction that eliminates the need for paired text transcriptions by leveraging self-supervised speech units, enabling intelligible speech synthesis with improved content fidelity. The multi-input vocoder innovation also materially enhances waveform quality from predicted features.",
      "canon_before": "Lip-to-speech models typically either use blurry acoustic targets alone for supervision or require paired text labels as stronger content guidance.",
      "delta_from_canon": "Uses discrete speech units derived via speech model quantization as pseudo-text supervision to the multi-target L2S model, enabling improved content modeling without text labels; additionally, uses speech units to condition a multi-input vocoder for waveform generation, enhancing intelligibility and speech quality.",
      "position_in_field": "Core video-to-speech / silent-speech reconstruction work utilizing self-supervised speech units for content modeling.",
      "practical_value": "High practical value in constructing lip-to-speech models when paired text labels are unavailable, improving intelligibility significantly on standard benchmarks.",
      "axes_moved": "content_supervision; vocoder_design; reconstruction_intelligibility",
      "axes_unresolved": "live_inference; occlusion_robustness; unconstrained_capture",
      "axes_regressed": "",
      "technical_limits": "Limited by dependence on benchmark video quality, vocoder stack complexity, and residual challenge in matching natural speech quality; no tested robustness to pose, occlusion, or spontaneous environment variability.",
      "evaluation_limits": "Evaluation is limited to test sets from LRS2 and LRS3 datasets, with no cross-dataset or live deployment exploration; human evaluation is limited to 15 participants rating MOS on 20 samples.",
      "deployment_limits": "No latency analysis, on-device implementation, or in-the-wild robustness to camera position/lighting/occlusion is demonstrated, limiting immediate real-world deployment.",
      "scope_limits": "Focuses strictly on lip-to-speech synthesis (speech reconstruction) without addressing text entry, command recognition, or speech enhancement. Evaluation confined to standard benchmarks LRS2 and LRS3.",
      "task": "speech-reconstruction",
      "input_modality": "video",
      "sensor_hardware": "Lip video with visual front-end optionally replaced by AV-HuBERT models pretrained on LRS3 and VoxCeleb2",
      "body_site": "lip",
      "output_type": "speech-audio",
      "vocabulary_type": "",
      "vocabulary_size": "",
      "metrics": "Metrics used include STOI, ESTOI, PESQ, and Word Error Rate (WER) on LRS2 and LRS3 datasets, plus Mean Opinion Score (MOS) for naturalness, intelligibility, and clearness based on human raters.",
      "evaluation_mode": "Quantitative evaluation on standard benchmarks (LRS2, LRS3) using STOI, ESTOI, PESQ, WER metrics and a small-scale human Mean Opinion Score (MOS) study.",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 1,
          "statement": "We propose a novel Lip-to-Speech synthesis (L2S) framework that uses quantized self-supervised speech representations, named speech units, as an additional prediction target alongside mel-spectrogram, allowing strong content supervision without text labels, and introduce a multi-input vocoder conditioned on mel-spectrogram and speech units to generate clear waveforms even from noisy mel-spectrograms.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_intelligible-lip-to-speech-synthesis-with-speech-units-intelligible-lip-to-speech-synthesis-with-speech-units.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_intelligible-lip-to-speech-synthesis-with-speech-units-intelligible-lip-to-speech-synthesis-with-speech-units.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 1,
          "statement": "The main technical advance is treating quantized speech units as pseudo-text supervision during training and conditioning the multi-input vocoder on both mel-spectrogram and these speech units to improve speech content fidelity and waveform quality without relying on text labels.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_intelligible-lip-to-speech-synthesis-with-speech-units-intelligible-lip-to-speech-synthesis-with-speech-units.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_intelligible-lip-to-speech-synthesis-with-speech-units-intelligible-lip-to-speech-synthesis-with-speech-units.txt",
          "section_or_location": "2.2. Multi",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "On LRS3 dataset, the proposed method achieves STOI=0.578, ESTOI=0.393, PESQ=1.31, and WER=29.8%, substantially outperforming Multi-Task baseline WER of 65.8%. On LRS2, it reaches STOI=0.585, ESTOI=0.412, PESQ=1.34, and WER=35.7%, again improving over previous methods, as reported in Tables 1 and 2.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_intelligible-lip-to-speech-synthesis-with-speech-units-intelligible-lip-to-speech-synthesis-with-speech-units.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_intelligible-lip-to-speech-synthesis-with-speech-units-intelligible-lip-to-speech-synthesis-with-speech-units.txt",
          "section_or_location": "4.1 Quantitative Comparison",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 1,
          "statement": "Evaluation is performed on LRS2 and LRS3 large-scale sentence-level lip-to-speech datasets with thousands of speakers in natural and wild environments; human MOS evaluation involved 15 participants rating naturalness, intelligibility, and clearness on 20 test samples from LRS3.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_intelligible-lip-to-speech-synthesis-with-speech-units-intelligible-lip-to-speech-synthesis-with-speech-units.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_intelligible-lip-to-speech-synthesis-with-speech-units-intelligible-lip-to-speech-synthesis-with-speech-units.txt",
          "section_or_location": "3.1 Datasets",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 1,
          "statement": "The approach depends on quality of benchmark video datasets (LRS2 and LRS3) and the vocoder stack complexity; the synthesized speech audio quality remains far from natural speech and no evaluation on real-world deployment scenarios, occlusion robustness, or user mobility has been conducted.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_intelligible-lip-to-speech-synthesis-with-speech-units-intelligible-lip-to-speech-synthesis-with-speech-units.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_intelligible-lip-to-speech-synthesis-with-speech-units-intelligible-lip-to-speech-synthesis-with-speech-units.txt",
          "section_or_location": "4 Results",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "deployment_claim",
          "confidence": 1,
          "statement": "No latency analysis, on-device deployment, or robustness under uncontrolled camera settings has been demonstrated; thus, deployment readiness for real-world applications remains limited despite benchmark performance gains.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_intelligible-lip-to-speech-synthesis-with-speech-units-intelligible-lip-to-speech-synthesis-with-speech-units.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_intelligible-lip-to-speech-synthesis-with-speech-units-intelligible-lip-to-speech-synthesis-with-speech-units.txt",
          "section_or_location": "5 Conclusions",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_adaptation-of-tongue-ultrasound-based-silent-speech-interfaces-using-spatial-transformer-networks",
      "slug": "adaptation-of-tongue-ultrasound-based-silent-speech-interfaces-using-spatial-transformer-networks",
      "title": "Adaptation of Tongue Ultrasound-Based Silent Speech Interfaces Using Spatial Transformer Networks",
      "year": 2023,
      "venue": "the Proceedings of Interspeech 2023",
      "authors": [
        "László Tóth",
        "Amin Honarmandi Shandiz",
        "Gábor Gosztolya",
        "Tamás Gábor Csapó"
      ],
      "url": "https://nao-ki-mura.com/paper/adaptation-of-tongue-ultrasound-based-silent-speech-interfaces-using-spatial-transformer-networks",
      "doi": "10.21437/Interspeech.2023-1607",
      "doi_url": "https://doi.org/10.21437/Interspeech.2023-1607",
      "arxiv_id": "2305.19130",
      "arxiv_url": "https://arxiv.org/abs/2305.19130",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text+structured_benchmark+summary",
      "source_coverage": "high",
      "tags": [
        "body_site:tongue",
        "deployment:speaker-dependent",
        "deployment:wearable",
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "modality:ultrasound",
        "output:speech-audio",
        "task:speech-reconstruction"
      ],
      "expert_take_short": "Strong full-text-backed evidence that most of the gain comes from fast input alignment, not from inventing a new SSI stack.",
      "expert_take_long": "The paper earns its value by narrowing the adaptation problem. The data section is small but explicit: four Hungarian speakers with 209 read sentences each, plus four extra sessions for one speaker after dismounting and remounting the probe. The experiments then separate STN-only, STN+output-layer, and full retraining. That matters because the headline 88-92% recovery is not 'speaker independence'; it is evidence that affine correction plus a tiny amount of retuning removes most of the adaptation gap in this controlled UTI-to-speech setup.",
      "expert_true_value": "This is a concrete SSI adaptation paper, not a general SSI breakthrough: it shows that a small alignment front-end can recover most cross-session and cross-speaker performance in ultrasound-to-speech regression.",
      "canon_before": "UTI silent-speech models were known to break under probe remounting and speaker mismatch, so adaptation usually meant retraining or collecting more multi-speaker data.",
      "delta_from_canon": "It isolates a large part of the adaptation problem as affine image misalignment and shows that updating the STN plus, optionally, the output layer recovers most of full adaptation.",
      "position_in_field": "Strong ultrasound SSI adaptation result focused on remounting robustness and faster retuning.",
      "practical_value": "Useful when a probe-mounted ultrasound SSI must be recalibrated after remounting or moved to a new speaker without paying full retraining cost.",
      "axes_moved": "speaker adaptation; session adaptation; alignment correction",
      "axes_unresolved": "speaker-independent deployment; open-vocabulary decoding; day-to-day robustness beyond one repeated speaker",
      "axes_regressed": "",
      "technical_limits": "The method only corrects what an affine image transform and a small output-layer update can fix; it does not solve broader speaker-independent SSI or language coverage.",
      "evaluation_limits": "Evidence is limited to four speakers, read Hungarian sentences, one speaker with repeated remounted sessions, and MSE-based regression quality rather than intelligibility or user studies.",
      "deployment_limits": "A probe-mounted ultrasound rig and supervised adaptation data are still required, so this is faster retuning rather than plug-and-play deployment.",
      "scope_limits": "Ultrasound-to-spectrogram SSI adaptation under controlled recording conditions.",
      "task": "speech reconstruction",
      "input_modality": "ultrasound-based silent speech interface",
      "sensor_hardware": "Articulate Instruments Ltd. Micro ultrasound system with probe-fixing headset",
      "body_site": "tongue",
      "output_type": "speech-audio",
      "vocabulary_type": "",
      "vocabulary_size": "",
      "metrics": "2D Table 2: STN-only closes 75-76% of the adaptation gap, while STN+out reaches 88% average cross-speaker and 92% cross-session error reduction relative to full adaptation; 3D Table 3 keeps similar relative gains with 87% average improvement for STN+out",
      "evaluation_mode": "Cross-speaker and cross-session adaptation on Hungarian UTI-to-speech conversion with MSE comparisons",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 0.99,
          "statement": "The paper extends UTI-based SSI networks with a spatial transformer network so quick speaker and session adaptation can be done without retraining the whole model.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_adaptation-of-tongue-ultrasound-based-silent-speech-interfaces-using-spatial-transformer-networks-adaptation-of-tongue-ultrasound-based-silent-speech-interfaces-using-spatial-tra.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_adaptation-of-tongue-ultrasound-based-silent-speech-interfaces-using-spatial-transformer-networks-adaptation-of-tongue-ultrasound-based-silent-speech-interfaces-using-spatial-tra.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 0.98,
          "statement": "Evaluation uses four Hungarian speakers with 209 read sentences each, plus four additional remounted sessions from speaker 048 recorded on different days.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_adaptation-of-tongue-ultrasound-based-silent-speech-interfaces-using-spatial-transformer-networks-adaptation-of-tongue-ultrasound-based-silent-speech-interfaces-using-spatial-tra.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_adaptation-of-tongue-ultrasound-based-silent-speech-interfaces-using-spatial-transformer-networks-adaptation-of-tongue-ultrasound-based-silent-speech-interfaces-using-spatial-tra.txt",
          "section_or_location": "4.1. Data Acquisition and Preprocessing",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.99,
          "statement": "In the 2D adaptation study, STN+out recovers 88% of the cross-speaker gap and 92% of the cross-session gap relative to full adaptation, while STN-only recovers about 75-76%.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_adaptation-of-tongue-ultrasound-based-silent-speech-interfaces-using-spatial-transformer-networks-adaptation-of-tongue-ultrasound-based-silent-speech-interfaces-using-spatial-tra.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_adaptation-of-tongue-ultrasound-based-silent-speech-interfaces-using-spatial-transformer-networks-adaptation-of-tongue-ultrasound-based-silent-speech-interfaces-using-spatial-tra.txt",
          "section_or_location": "Table 2",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 0.95,
          "statement": "The paper says future work should test a 3D localization network and smaller amounts of adaptation material, indicating the current evidence remains a controlled adaptation study rather than a broad deployment result.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_adaptation-of-tongue-ultrasound-based-silent-speech-interfaces-using-spatial-transformer-networks-adaptation-of-tongue-ultrasound-based-silent-speech-interfaces-using-spatial-tra.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_adaptation-of-tongue-ultrasound-based-silent-speech-interfaces-using-spatial-transformer-networks-adaptation-of-tongue-ultrasound-based-silent-speech-interfaces-using-spatial-tra.txt",
          "section_or_location": "6. Conclusions",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_zero-shot-personalized-lip-to-speech-synthesis-with-face-image-based-voice-control",
      "slug": "zero-shot-personalized-lip-to-speech-synthesis-with-face-image-based-voice-control",
      "title": "Zero-shot personalized lip-to-speech synthesis with face image based voice control",
      "year": 2023,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Zheng-Yan Sheng",
        "Yang Ai",
        "Zhen-Hua Ling"
      ],
      "url": "https://nao-ki-mura.com/paper/zero-shot-personalized-lip-to-speech-synthesis-with-face-image-based-voice-control",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2305.14359",
      "arxiv_url": "https://arxiv.org/abs/2305.14359",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:lip",
        "modality:video",
        "task:speech-reconstruction",
        "output:speech-audio",
        "deployment:speaker-independent",
        "evaluation:quantitative"
      ],
      "expert_take_short": "Demonstrates effective zero-shot voice control in Lip2Speech by leveraging face image-based speaker embeddings, validated on GRID corpus but constrained by dataset vocabulary and speech naturalness.",
      "expert_take_long": "This work advances lip-to-speech synthesis by achieving zero-shot personalized voice control using face images without reference speech, a step forward from prior methods requiring speech enrollment of unseen speakers. The method employs a VAE to disentangle speaker-independent linguistic content and a face identity encoder trained by cross-modal voice-face representation learning to produce speaker embeddings aligned with voice characteristics. Experimental results on the constrained GRID corpus demonstrate the effectiveness of face-based voice control by comparable speaker similarity (lower EER) and face-voice matching MOS compared to speech-reference-based baselines. However, the approach is currently limited to limited vocabulary, shows some speech quality tradeoffs, and lacks validation on open vocabulary or wild video data. This establishes a baseline for zero-shot personalized Lip2Speech with identity control from face images, inviting further work on scaling to diverse and unconstrained real-world scenarios.",
      "expert_true_value": "The primary scientific contribution is enabling zero-shot speaker identity control for Lip2Speech synthesis without needing any enrollment speech, by learning and applying face image-based speaker embeddings derived from cross-modal knowledge transfer from speech embeddings, which is a new paradigm in silent video speech synthesis.",
      "canon_before": "Multi-speaker Lip2Speech typically depends on speech-based speaker embeddings (SSE) from reference speech, requiring enrollment and thus not supporting zero-shot voice control from silent video alone.",
      "delta_from_canon": "Replaces the need for reference speech embeddings by face-based speaker embeddings trained via cross-modal loss, enabling speaker identity control from only silent video input in a zero-shot setting. Also introduces a VAE to disentangle speaker identity from lip video content in latent representation.",
      "position_in_field": "Advances zero-shot lip-to-speech personalization by cross-modal embedding of face images to voice characteristics, contributing to silent speech interfaces and visual speech reconstruction fields.",
      "practical_value": "Useful for silent video speech reconstruction with speaker identity control without enrollment speech, but currently constrained to limited vocabulary and controlled lab conditions.",
      "axes_moved": "system_design; problem_reframing; evaluation",
      "axes_unresolved": "Open-vocabulary scaling; robustness to pose and in-the-wild video",
      "axes_regressed": "",
      "technical_limits": "Limited by GRID constrained vocabulary; occasional gender mismatches in ablations; speech synthesis quality below natural speech and speaker-reference upper bound; limited robustness and generalization beyond controlled dataset.",
      "evaluation_limits": "Tested only on GRID dataset; 13 unseen speakers evaluated. Objective metrics (STOI, ESTOI, PESQ, EER) and subjective MOS-SN and MOS-FVM evaluated. No explicit unseen word or unconstrained vocabulary testing. No evaluation on mobile or real-time settings.",
      "deployment_limits": "Currently limited to constrained vocabulary of GRID corpus with 6-word sentences. Speech quality remains below ground truth and seen-speaker baselines. Requires large-scale and diverse audiovisual data for real-world application. Not tested for open vocabulary or in-the-wild videos.",
      "scope_limits": "Focuses on lip-to-speech synthesis with speaker identity control under constrained vocabulary (GRID corpus); not for large vocabulary or open-domain visual speech recognition or synthesis.",
      "task": "speech-reconstruction",
      "input_modality": "video (lip-centered frames) plus face image",
      "sensor_hardware": "camera",
      "body_site": "lip",
      "output_type": "speech-audio",
      "vocabulary_type": "sentence-level utterances constrained by GRID grammar",
      "vocabulary_size": "Constrained GRID grammar with 6-word utterances",
      "metrics": "STOI, ESTOI, PESQ for intelligibility and quality; Equal Error Rate (EER) for speaker similarity; Mean Opinion Score for Speech Naturalness (MOS-SN); Mean Opinion Score for Face-Voice Matching (MOS-FVM) for identity consistency",
      "evaluation_mode": "Objective intelligibility and quality metrics (STOI, ESTOI, PESQ), speaker similarity (EER), and subjective mean opinion scores (MOS-SN for naturalness, MOS-FVM for face-voice matching) on Mechanical Turk ratings.",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 1,
          "statement": "In this paper, we propose a zero-shot personalized Lip2Speech synthesis method, in which face images control speaker identities.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_zero-shot-personalized-lip-to-speech-synthesis-with-face-image-based-voice-control-zero-shot-personalized-lip-to-speech-synthesis-with-face-image-based-voice-contr.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_zero-shot-personalized-lip-to-speech-synthesis-with-face-image-based-voice-control-zero-shot-personalized-lip-to-speech-synthesis-with-face-image-based-voice-contr.txt",
          "section_or_location": "ABSTRACT",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 1,
          "statement": "We present a method of associated voice-face representation learning that transfers knowledge from a speech identity encoder to a face identity encoder, enabling face images to control voice characteristics in Lip2Speech synthesis without requiring reference speech.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_zero-shot-personalized-lip-to-speech-synthesis-with-face-image-based-voice-control-zero-shot-personalized-lip-to-speech-synthesis-with-face-image-based-voice-contr.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_zero-shot-personalized-lip-to-speech-synthesis-with-face-image-based-voice-control-zero-shot-personalized-lip-to-speech-synthesis-with-face-image-based-voice-contr.txt",
          "section_or_location": "2.2. Associated Voice",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 1,
          "statement": "Experiments were conducted on the GRID dataset consisting of 33 speakers with constrained vocabulary utterances. Training used 20 speakers (10 female + 10 male), with 13 speakers held out for unseen testing.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_zero-shot-personalized-lip-to-speech-synthesis-with-face-image-based-voice-control-zero-shot-personalized-lip-to-speech-synthesis-with-face-image-based-voice-contr.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_zero-shot-personalized-lip-to-speech-synthesis-with-face-image-based-voice-control-zero-shot-personalized-lip-to-speech-synthesis-with-face-image-based-voice-contr.txt",
          "section_or_location": "3.1. Datasets",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "Evaluation uses objective metrics STOI, ESTOI, PESQ for intelligibility and naturalness assessment; EER for speaker similarity; and subjective MOS-SN for naturalness and MOS-FVM for face-voice matching to assess identity matching.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_zero-shot-personalized-lip-to-speech-synthesis-with-face-image-based-voice-control-zero-shot-personalized-lip-to-speech-synthesis-with-face-image-based-voice-contr.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_zero-shot-personalized-lip-to-speech-synthesis-with-face-image-based-voice-control-zero-shot-personalized-lip-to-speech-synthesis-with-face-image-based-voice-contr.txt",
          "section_or_location": "3.4. Evaluation Results",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 1,
          "statement": "The method is limited to the GRID dataset constrained vocabulary setting; synthesis speech quality is below ground truth and seen-speaker upper bound, and is not tested on open vocabulary or in-the-wild video conditions.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_zero-shot-personalized-lip-to-speech-synthesis-with-face-image-based-voice-control-zero-shot-personalized-lip-to-speech-synthesis-with-face-image-based-voice-contr.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_zero-shot-personalized-lip-to-speech-synthesis-with-face-image-based-voice-control-zero-shot-personalized-lip-to-speech-synthesis-with-face-image-based-voice-contr.txt",
          "section_or_location": "4. CONCLUSION",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "deployment_claim",
          "confidence": 1,
          "statement": "Zero-shot personalized Lip2Speech can be achieved from face images without enrollment speech at inference, enabling new assistive applications where speech is unavailable as reference.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_zero-shot-personalized-lip-to-speech-synthesis-with-face-image-based-voice-control-zero-shot-personalized-lip-to-speech-synthesis-with-face-image-based-voice-contr.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_zero-shot-personalized-lip-to-speech-synthesis-with-face-image-based-voice-control-zero-shot-personalized-lip-to-speech-synthesis-with-face-image-based-voice-contr.txt",
          "section_or_location": "2. PROPOSED METHOD",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 0.95,
          "statement": "The system uses a VAE structure to disentangle speaker identity and linguistic content in video, enabling control of voice characteristics through concatenation of face-based speaker embeddings and latent content codes for decoding speech.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_zero-shot-personalized-lip-to-speech-synthesis-with-face-image-based-voice-control-zero-shot-personalized-lip-to-speech-synthesis-with-face-image-based-voice-contr.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_zero-shot-personalized-lip-to-speech-synthesis-with-face-image-based-voice-control-zero-shot-personalized-lip-to-speech-synthesis-with-face-image-based-voice-contr.txt",
          "section_or_location": "2.1. VAE",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 0.9,
          "statement": "Face identity encoder is trained by multi-losses (cosine similarity, gender contrastive, cross-entropy) to produce face embeddings aligned with speech identity embeddings for voice control at inference stage.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_zero-shot-personalized-lip-to-speech-synthesis-with-face-image-based-voice-control-zero-shot-personalized-lip-to-speech-synthesis-with-face-image-based-voice-contr.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_zero-shot-personalized-lip-to-speech-synthesis-with-face-image-based-voice-control-zero-shot-personalized-lip-to-speech-synthesis-with-face-image-based-voice-contr.txt",
          "section_or_location": "2.2. Associated Voice",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 0.95,
          "statement": "Evaluation is conducted with seen and unseen speakers from GRID; subjective tests with MOS-SN and MOS-FVM on Amazon Mechanical Turk with 20 listeners over 20 randomly selected sentences.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_zero-shot-personalized-lip-to-speech-synthesis-with-face-image-based-voice-control-zero-shot-personalized-lip-to-speech-synthesis-with-face-image-based-voice-contr.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_zero-shot-personalized-lip-to-speech-synthesis-with-face-image-based-voice-control-zero-shot-personalized-lip-to-speech-synthesis-with-face-image-based-voice-contr.txt",
          "section_or_location": "3.4. Evaluation Results",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.9,
          "statement": "Objective evaluation shows Proposed Speech using reference speech embeddings achieves lower EER, but Proposed using face embeddings achieves comparable intelligibility and quality, validating face-based voice control feasibility.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_zero-shot-personalized-lip-to-speech-synthesis-with-face-image-based-voice-control-zero-shot-personalized-lip-to-speech-synthesis-with-face-image-based-voice-contr.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_zero-shot-personalized-lip-to-speech-synthesis-with-face-image-based-voice-control-zero-shot-personalized-lip-to-speech-synthesis-with-face-image-based-voice-contr.txt",
          "section_or_location": "3.4. Evaluation Results",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 0.9,
          "statement": "Proposed-VAE variant sometimes produces incorrect speaker gender voice, indicating challenges in disentangling speaker identity in the latent space effectively.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_zero-shot-personalized-lip-to-speech-synthesis-with-face-image-based-voice-control-zero-shot-personalized-lip-to-speech-synthesis-with-face-image-based-voice-contr.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_zero-shot-personalized-lip-to-speech-synthesis-with-face-image-based-voice-control-zero-shot-personalized-lip-to-speech-synthesis-with-face-image-based-voice-contr.txt",
          "section_or_location": "3.4. Evaluation Results",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.8,
          "statement": "Face-voice matching MOS-FVM is introduced to assess how well synthesized speech matches face image identity, providing a novel subjective evaluation metric for cross-modal identity consistency.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_zero-shot-personalized-lip-to-speech-synthesis-with-face-image-based-voice-control-zero-shot-personalized-lip-to-speech-synthesis-with-face-image-based-voice-contr.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_zero-shot-personalized-lip-to-speech-synthesis-with-face-image-based-voice-control-zero-shot-personalized-lip-to-speech-synthesis-with-face-image-based-voice-contr.txt",
          "section_or_location": "3.4. Evaluation Results",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_improving-the-gap-in-visual-speech-recognition-between-normal-and-silent-speech-based-on-metric-learning",
      "slug": "improving-the-gap-in-visual-speech-recognition-between-normal-and-silent-speech-based-on-metric-learning",
      "title": "Improving the Gap in Visual Speech Recognition Between Normal and Silent Speech Based on Metric Learning",
      "year": 2023,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "[''",
        "''",
        "''",
        "'']"
      ],
      "url": "https://nao-ki-mura.com/paper/improving-the-gap-in-visual-speech-recognition-between-normal-and-silent-speech-based-on-metric-learning",
      "doi": "10.21437/Interspeech.2023-370",
      "doi_url": "https://doi.org/10.21437/Interspeech.2023-370",
      "arxiv_id": "2305.14203",
      "arxiv_url": "https://arxiv.org/abs/2305.14203",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:lip",
        "modality:video",
        "output:text",
        "task:speech-recognition",
        "evaluation:quantitative",
        "deployment:speaker-independent"
      ],
      "expert_take_short": "Strong viseme-level metric learning approach reduces silent speech VSR errors on a small 10-phrase dataset, notably achieving parity with baselines using much less silent data.",
      "expert_take_long": "This paper addresses the longstanding silent speech gap in visual speech recognition by framing normal and silent speech as viseme distribution alignment problems, employing novel metric learning losses that minimize KL divergence between viseme probability distributions both inter- and intra-speech types. This methodological reframing allows leveraging scarce silent training data more efficiently, as validated on the AV Digits dataset with an augmentation from OuluVS2 normal speech. The key result is that with half the silent data, the method achieves equivalent or better silent speech visual recognition error rates than strong baselines. However, the vocabulary remains tiny (10 fixed phrases), and evaluation is restricted to carefully controlled datasets without real-time or in-the-wild deployment studies. Thus, the contribution is a technically strong, core silent VSR contribution demonstrating successful viseme-level metric learning for cross-speech mode alignment, but its scalability to open vocabulary or larger datasets and deployment robustness is untested.",
      "expert_true_value": "Provides one of the first effective metric learning methods to close the performance gap in silent speech visual recognition by aligning viseme distributions between normal and silent speech, validated with controlled public datasets but still limited in vocabulary scale and real-world robustness assessment.",
      "canon_before": "Silent visual speech recognition performs worse than normal speech VSR due to scarcity of silent data and differing lip dynamics between speaking modes.",
      "delta_from_canon": "Instead of treating normal and silent speech independently, the paper proposes minimizing Kullback-Leibler divergence between predicted viseme distributions across speech types, imposing metric learning regularization that aligns viseme representations between normal and silent speech.",
      "position_in_field": "Core SSI-adjacent visual speech recognition work improving silent speech recognition",
      "practical_value": "Useful for low-data silent phrase recognition scenarios, enabling better silent VSR using limited silent data by leveraging abundant normal speech data through metric learning.",
      "axes_moved": "low_data_transfer; visual_silent_recognition; viseme_alignment",
      "axes_unresolved": "open_vocabulary; in_the_wild_robustness; richer_language_modeling",
      "axes_regressed": "",
      "technical_limits": "Model limited to 10 fixed phrases, using separate visual and language models, relying on text to phoneme to viseme mapping; no end-to-end open vocabulary training or testing; small dataset size limits broader applicability.",
      "evaluation_limits": "Evaluation is limited to AV Digits and OuluVS2 datasets focused on short fixed phrases; no validation on open vocabulary, larger vocabulary, or continuous speech datasets.",
      "deployment_limits": "Current work lacks real-time mobile evaluation and robustness testing in-the-wild; system trained and validated only on small 10-phrase vocabulary datasets and controlled setups.",
      "scope_limits": "Limited to closed vocabulary phrase recognition on small datasets; does not address continuous or open vocabulary speech recognition or real-world conditions.",
      "task": "speech-recognition",
      "input_modality": "video (lip-focused grayscale video frames)",
      "sensor_hardware": "face-aligned lip video captured and cropped using Dlib face landmarking",
      "body_site": "lip",
      "output_type": "text",
      "vocabulary_type": "fixed phrase",
      "vocabulary_size": "10 phrases",
      "metrics": "Viseme Error Rate (VER) and Word Error Rate (WER); best silent speech result: 6.66% VER and 9.97% WER with combined losses including LNCE, LSCE, LWKL, LNKL, LSKL",
      "evaluation_mode": "Viseme Error Rate (VER) and Word Error Rate (WER) on AV Digits silent and normal speech, plus OuluVS2 normal speech augmentation; cross-validation with 39 speakers.",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 1,
          "statement": "This paper presents a novel metric learning approach to address the performance gap between normal and silent speech in visual speech recognition (VSR) by leveraging the shared literal content between the two speech types.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_improving-the-gap-in-visual-speech-recognition-between-normal-and-silent-speech-based-on-metric-learning-improving-the-gap-in-visual-speech-recognition-between-normal-and-silent-speech-.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_improving-the-gap-in-visual-speech-recognition-between-normal-and-silent-speech-based-on-metric-learning-improving-the-gap-in-visual-speech-recognition-between-normal-and-silent-speech-.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 1,
          "statement": "The main contribution is a novel metric learning approach based on viseme distribution alignment using Kullback-Leibler divergence to bring normal and silent speech viseme probability distributions close in latent space, enabling better silent speech VSR.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_improving-the-gap-in-visual-speech-recognition-between-normal-and-silent-speech-based-on-metric-learning-improving-the-gap-in-visual-speech-recognition-between-normal-and-silent-speech-.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_improving-the-gap-in-visual-speech-recognition-between-normal-and-silent-speech-based-on-metric-learning-improving-the-gap-in-visual-speech-recognition-between-normal-and-silent-speech-.txt",
          "section_or_location": "3.2. Metric learning for inter",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "Viseme Error Rate (VER) and Word Error Rate (WER) metrics were used for evaluation; best silent speech results were 6.66% VER and 9.97% WER achieved by the combined loss LNCE + LSCE + LWKL + LNKL + LSKL.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_improving-the-gap-in-visual-speech-recognition-between-normal-and-silent-speech-based-on-metric-learning-improving-the-gap-in-visual-speech-recognition-between-normal-and-silent-speech-.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_improving-the-gap-in-visual-speech-recognition-between-normal-and-silent-speech-based-on-metric-learning-improving-the-gap-in-visual-speech-recognition-between-normal-and-silent-speech-.txt",
          "section_or_location": "4.4. Experimental results",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 1,
          "statement": "Evaluation was conducted on the AV Digits dataset with 39 speakers uttering 10 fixed phrases in both normal and silent speech modes, augmented with 1560 normal speech utterances from OuluVS2; the test set includes 550 utterances from 11 speakers.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_improving-the-gap-in-visual-speech-recognition-between-normal-and-silent-speech-based-on-metric-learning-improving-the-gap-in-visual-speech-recognition-between-normal-and-silent-speech-.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_improving-the-gap-in-visual-speech-recognition-between-normal-and-silent-speech-based-on-metric-learning-improving-the-gap-in-visual-speech-recognition-between-normal-and-silent-speech-.txt",
          "section_or_location": "4.1. Database",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 1,
          "statement": "The study is limited to a small fixed vocabulary of 10 phrases and uses only controlled datasets (AV Digits and OuluVS2) without any evaluation on open vocabulary, continuous speech, or in-the-wild conditions, restricting generalization and deployment readiness.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_improving-the-gap-in-visual-speech-recognition-between-normal-and-silent-speech-based-on-metric-learning-improving-the-gap-in-visual-speech-recognition-between-normal-and-silent-speech-.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_improving-the-gap-in-visual-speech-recognition-between-normal-and-silent-speech-based-on-metric-learning-improving-the-gap-in-visual-speech-recognition-between-normal-and-silent-speech-.txt",
          "section_or_location": "5. Conclusion",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "deployment_claim",
          "confidence": 0.9,
          "statement": "The approach improves silent speech VSR accuracy with less silent data, achieving parity with baseline trained on twice the silent data, but lacks real-time or in-the-wild mobile deployment evaluations.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_improving-the-gap-in-visual-speech-recognition-between-normal-and-silent-speech-based-on-metric-learning-improving-the-gap-in-visual-speech-recognition-between-normal-and-silent-speech-.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_improving-the-gap-in-visual-speech-recognition-between-normal-and-silent-speech-based-on-metric-learning-improving-the-gap-in-visual-speech-recognition-between-normal-and-silent-speech-.txt",
          "section_or_location": "4.4. Experimental results",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_conditional-generation-of-audio-from-video-via-foley-analogies",
      "slug": "conditional-generation-of-audio-from-video-via-foley-analogies",
      "title": "Conditional Generation of Audio from Video via Foley Analogies",
      "year": 2023,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Yuexi Du",
        "Ziyang Chen",
        "Justin Salamon",
        "Bryan Russell",
        "Andrew Owens"
      ],
      "url": "https://nao-ki-mura.com/paper/conditional-generation-of-audio-from-video-via-foley-analogies",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2304.08490",
      "arxiv_url": "https://arxiv.org/abs/2304.08490",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "high",
      "review_basis": "full_text+structured_benchmark+summary",
      "source_coverage": "high",
      "tags": [
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "modality:acoustic",
        "modality:multimodal",
        "modality:video",
        "output:audio"
      ],
      "expert_take_short": "The paper matters because it gives V2A generation a controllable exemplar, not because it beats every timing baseline.",
      "expert_take_long": "The full text supports a more precise reading than the summary-only draft. The important move is the training formulation: two clips from the same source video create self-supervised conditioning pairs, and test-time re-ranking uses a separate sync model to choose from many generations. That yields real control over material/timbre cues, but the quantitative story is mixed. The re-ranked model improves material and action metrics over unconditional variants, yet onset-transfer still wins on synchronization because it is engineered for that subproblem. So the paper is a meaningful controllable Foley result, but not a solved synchronization paper and not SSI work.",
      "expert_true_value": "The durable contribution is controlled neural Foley, not SSI: the paper makes exemplar-conditioned soundtrack generation plausible enough for assistive sound-design workflows, but timing fidelity remains modest.",
      "canon_before": "Prior video-to-audio systems predicted a video's co-occurring sound but gave little artist control over what the result should sound like.",
      "delta_from_canon": "It reframes Foley as analogy-based conditional generation and adds a self-supervised training recipe plus sync-based re-ranking to make exemplar control usable at inference.",
      "position_in_field": "Early strong conditional Foley paper that adds user control to video-to-audio generation.",
      "practical_value": "Relevant for sound-design assistance and interactive Foley prototyping, not for silent-speech decoding.",
      "axes_moved": "user control; timbre conditioning; synchronization reranking",
      "axes_unresolved": "dense overlapping sounds; arbitrary exemplar mismatch; timing precision without re-ranking",
      "axes_regressed": "",
      "technical_limits": "Synchronization is still fragile, generation quality depends on sampling and re-ranking, and the pipeline is aimed at soundtrack synthesis rather than communication.",
      "evaluation_limits": "The strongest quantitative evidence comes from Greatest Hits; CountixAV and wild-video results are qualitative, and onset transfer still wins the synchronization subtask.",
      "deployment_limits": "This is an offline creative tool pipeline rather than a real-time SSI system.",
      "scope_limits": "Conditional Foley generation only; outside SSI scope.",
      "task": "conditional foley generation",
      "input_modality": "video + audio conditioning",
      "sensor_hardware": "",
      "body_site": "",
      "output_type": "audio",
      "vocabulary_type": "",
      "vocabulary_size": "",
      "metrics": "Table 1: Ours w/ re-rank reaches 44.0% overall material accuracy, 66.7% overall action accuracy, 25.3% onset-count accuracy, and 54.3 AP onset synchronization on Greatest Hits. Table 2: in human study, re-ranked outputs are preferred over the base model 54.3% on material and 53.8% on synchronization.",
      "evaluation_mode": "Greatest Hits quantitative evaluation, Amazon Mechanical Turk perceptual study, and qualitative transfer to CountixAV and in-the-wild videos",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 0.99,
          "statement": "The paper formulates conditional Foley as generating a soundtrack for a silent video given a user-supplied audio-visual example that specifies what the video should sound like.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_conditional-generation-of-audio-from-video-via-foley-analogies-conditional-generation-of-audio-from-video-via-foley-analogies.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_conditional-generation-of-audio-from-video-via-foley-analogies-conditional-generation-of-audio-from-video-via-foley-analogies.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.98,
          "statement": "On Greatest Hits, the re-ranked model reaches 44.0% overall material accuracy, 66.7% overall action accuracy, 25.3% onset-count accuracy, and 54.3 AP onset synchronization.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_conditional-generation-of-audio-from-video-via-foley-analogies-conditional-generation-of-audio-from-video-via-foley-analogies.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_conditional-generation-of-audio-from-video-via-foley-analogies-conditional-generation-of-audio-from-video-via-foley-analogies.txt",
          "section_or_location": "Table 1. Automated evaluation metrics.",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 0.97,
          "statement": "The perceptual study used 376 participants on Amazon Mechanical Turk, and the re-ranked variant was preferred over the base model 54.3% for material match and 53.8% for synchronization.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_conditional-generation-of-audio-from-video-via-foley-analogies-conditional-generation-of-audio-from-video-via-foley-analogies.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_conditional-generation-of-audio-from-video-via-foley-analogies-conditional-generation-of-audio-from-video-via-foley-analogies.txt",
          "section_or_location": "Table 2. Perceptual study results.",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 0.95,
          "statement": "The paper notes that onset transfer can outperform the base generative model in perceptual synchronization because it literally copies conditional sounds at impact times, and it warns that this advantage disappears when sounds are not cleanly separable into simple onsets.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_conditional-generation-of-audio-from-video-via-foley-analogies-conditional-generation-of-audio-from-video-via-foley-analogies.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_conditional-generation-of-audio-from-video-via-foley-analogies-conditional-generation-of-audio-from-video-via-foley-analogies.txt",
          "section_or_location": "5. Discussion",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_speech-reconstruction-from-silent-tongue-and-lip-articulation-by-pseudo-target-generation-and-domain-adversarial-training",
      "slug": "speech-reconstruction-from-silent-tongue-and-lip-articulation-by-pseudo-target-generation-and-domain-adversarial-training",
      "title": "Speech Reconstruction from Silent Tongue and Lip Articulation By Pseudo Target Generation and Domain Adversarial Training",
      "year": 2023,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Ruichen Zheng",
        "Yang Ai",
        "Zhenhua Ling"
      ],
      "url": "https://nao-ki-mura.com/paper/speech-reconstruction-from-silent-tongue-and-lip-articulation-by-pseudo-target-generation-and-domain-adversarial-training",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2304.05574",
      "arxiv_url": "https://arxiv.org/abs/2304.05574",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:lip",
        "body_site:oral-cavity",
        "body_site:tongue",
        "evaluation:quantitative",
        "modality:multimodal",
        "modality:ultrasound",
        "modality:video",
        "output:speech-audio",
        "task:speech-reconstruction"
      ],
      "expert_take_short": "Strong SSI paper improving silent speech reconstruction by generating pseudo acoustic targets and using domain adversarial training to address domain mismatch; validated with TaL dataset showing substantial WER and MOS gains over TaLNet.",
      "expert_take_long": "This work convincingly addresses two key challenges in silent speech reconstruction: lack of paired acoustic targets and vocalized-silent domain mismatch. By generating pseudo acoustic targets via DTW alignment from paired vocalized articulation data and employing domain adversarial training to produce domain-invariant articulatory feature representations, the authors overcome limitations of prior ultrasound-and-lip based SSI systems trained only on vocalized data. Iterative retraining further refines the model. The experimental results on the TaL dataset validate that these approaches yield substantial improvements in WER, MCD, STOI, and MOS compared to the TaLNet baseline, both in silent and vocalized speaking modes. However, a clear performance gap remains between silent and vocalized modes, and hardware constraints limit deployment readiness. Overall, the paper provides a strong system integration and training methodology contribution advancing articulatory-to-acoustic conversion for silent speech interfaces using multimodal tongue ultrasound and lip video data.",
      "expert_true_value": "The paper's key addition is a novel training approach leveraging pseudo target generation and domain adversarial learning to overcome silent mode data scarcity and domain mismatch in multimodal tongue ultrasound plus lip video speech reconstruction, not just the neural decoder design.",
      "canon_before": "Ultrasound-and-lip reconstruction models trained on vocalized speech perform poorly on silent articulation due to missing acoustic targets and domain mismatch.",
      "delta_from_canon": "Introduces DTW-generated pseudo acoustic targets for silent articulation and domain adversarial training to learn domain-invariant features, combined with iterative retraining to improve silent-mode speech reconstruction.",
      "position_in_field": "A strong core SSI result advancing articulatory-to-acoustic conversion with ultrasound and lip video, addressing silent speech training challenges.",
      "practical_value": "Provides a working approach for silent-mode articulatory speech reconstruction that could benefit clinical or silent communication settings where ultrasound and lip video capture are feasible.",
      "axes_moved": "silent_mode_training; domain_adaptation; multimodal_articulatory_reconstruction",
      "axes_unresolved": "The remaining performance gap between silent and vocalized modes and hardware specialization remain unresolved.",
      "axes_regressed": "",
      "technical_limits": "No ground truth acoustic data for silent mode complicates training; DTW-based pseudo labels may still have alignment noise; silent mode performance lags vocalized; some speakers with poor articulation.",
      "evaluation_limits": "Evaluation restricted to the TaL corpus; results validated on silent and vocalized modes with some speakers excluded due to unreliable articulations.",
      "deployment_limits": "Specialized ultrasound and lip video hardware requirements; no deployment or real-time study provided.",
      "scope_limits": "Articulatory-to-acoustic speech reconstruction from silent tongue and lip articulation only; multimodal ultrasound and optical lip video input in controlled corpus setting.",
      "task": "speech-reconstruction",
      "input_modality": "ultrasound tongue imaging; optical lip video",
      "sensor_hardware": "Ultrasound tongue imaging system; optical lip video camera",
      "body_site": "tongue; lip; oral-cavity",
      "output_type": "speech-audio",
      "vocabulary_type": "",
      "vocabulary_size": "",
      "metrics": "Silent mode: MCD 3.935 dB, STOI 0.517, WER 43.114%, MOS 3.330; Vocalized mode: WER 17.309%, improvements of ~15% WER and 0.34 MOS in silent mode compared to TaLNet baseline. Metrics from Table 1 of the paper.",
      "evaluation_mode": "Objective metrics (MCD, STOI, WER) via ASR plus subjective MOS testing on silent and vocalized test sets; ablation on iterative training and domain adversarial training.",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 1,
          "statement": "The paper proposes pseudo target generation using DTW alignment between vocalized and silent articulations, domain adversarial training to learn robust domain-invariant articulatory representations, and iterative training strategy to improve speech reconstruction from silent tongue and lip articulation.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_speech-reconstruction-from-silent-tongue-and-lip-articulation-by-pseudo-target-generation-and-domain-adversarial-training-speech-reconstruction-from-silent-tongue-and-lip-articulation-by-pseudo-target-g.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_speech-reconstruction-from-silent-tongue-and-lip-articulation-by-pseudo-target-generation-and-domain-adversarial-training-speech-reconstruction-from-silent-tongue-and-lip-articulation-by-pseudo-target-g.txt",
          "section_or_location": "3. PROPOSED METHOD",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 0.95,
          "statement": "The paper combines dynamic time warping-based pseudo target generation, domain adversarial training, and iterative retraining tailored for silent articulatory-to-acoustic conversion from multimodal tongue ultrasound and lip video input, addressing domain mismatch and lack of paired acoustic targets for silent speech reconstruction.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_speech-reconstruction-from-silent-tongue-and-lip-articulation-by-pseudo-target-generation-and-domain-adversarial-training-speech-reconstruction-from-silent-tongue-and-lip-articulation-by-pseudo-target-g.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_speech-reconstruction-from-silent-tongue-and-lip-articulation-by-pseudo-target-generation-and-domain-adversarial-training-speech-reconstruction-from-silent-tongue-and-lip-articulation-by-pseudo-target-g.txt",
          "section_or_location": "3. PROPOSED METHOD",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "In silent mode the paper reports MCD 3.935 dB, STOI 0.517, WER 43.114%, and MOS 3.330, improving on TaLNet by about 15 WER points and 0.34 MOS; in vocalized mode WER reduces from 26.890% to 17.309%.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_speech-reconstruction-from-silent-tongue-and-lip-articulation-by-pseudo-target-generation-and-domain-adversarial-training-speech-reconstruction-from-silent-tongue-and-lip-articulation-by-pseudo-target-g.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_speech-reconstruction-from-silent-tongue-and-lip-articulation-by-pseudo-target-generation-and-domain-adversarial-training-speech-reconstruction-from-silent-tongue-and-lip-articulation-by-pseudo-target-g.txt",
          "section_or_location": "4.3 Experimental Results",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 1,
          "statement": "The evaluation is performed on the TaL corpus containing synchronized audio, ultrasound tongue images, and lip videos in vocalized and silent speaking modes from 81 native English speakers, with 1212 paired utterances, in a speaker-independent setting.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_speech-reconstruction-from-silent-tongue-and-lip-articulation-by-pseudo-target-generation-and-domain-adversarial-training-speech-reconstruction-from-silent-tongue-and-lip-articulation-by-pseudo-target-g.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_speech-reconstruction-from-silent-tongue-and-lip-articulation-by-pseudo-target-generation-and-domain-adversarial-training-speech-reconstruction-from-silent-tongue-and-lip-articulation-by-pseudo-target-g.txt",
          "section_or_location": "4. EXPERIMENTS",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 1,
          "statement": "Silent-mode speech reconstruction performance remains worse than vocalized mode, and some speakers with unreliable silent articulations are excluded; specialized ultrasound and optical lip video hardware constrain deployment.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_speech-reconstruction-from-silent-tongue-and-lip-articulation-by-pseudo-target-generation-and-domain-adversarial-training-speech-reconstruction-from-silent-tongue-and-lip-articulation-by-pseudo-target-g.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_speech-reconstruction-from-silent-tongue-and-lip-articulation-by-pseudo-target-generation-and-domain-adversarial-training-speech-reconstruction-from-silent-tongue-and-lip-articulation-by-pseudo-target-g.txt",
          "section_or_location": "4.3 Experimental Results",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_wesper-zero-shot-and-realtime-whisper-to-normal-voice-conversion-for-whisper-based-speech-interactions",
      "slug": "wesper-zero-shot-and-realtime-whisper-to-normal-voice-conversion-for-whisper-based-speech-interactions",
      "title": "WESPER: Zero-shot and Realtime Whisper to Normal Voice Conversion for Whisper-based Speech Interactions",
      "year": 2023,
      "venue": "Proceedings of the 2023 CHI Conference on Human Factors in Computing Systems (CHI '23), April 23–28, 2023",
      "authors": [
        "Jun Rekimoto"
      ],
      "url": "https://nao-ki-mura.com/paper/wesper-zero-shot-and-realtime-whisper-to-normal-voice-conversion-for-whisper-based-speech-interactions",
      "doi": "10.1145/3544548.3580706",
      "doi_url": "https://doi.org/10.1145/3544548.3580706",
      "arxiv_id": "2303.01639",
      "arxiv_url": "https://arxiv.org/abs/2303.01639",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "deployment:real-time",
        "deployment:speaker-independent",
        "evaluation:quantitative",
        "modality:acoustic",
        "output:speech-audio",
        "task:speech-reconstruction"
      ],
      "expert_take_short": "Strong whisper-conversion paper, but it remains whisper-based rather than truly silent SSI.",
      "expert_take_long": "WESPER is strong on its own terms. The paper backs the important claims: common speech units reduce the whisper-normal mismatch, the non-autoregressive stack runs in real time, and recognition improves substantially after conversion. The boundary condition is scope, not evidence quality. This is adjacent to SSI because whispered speech still produces audible input, so it should not be sold as a full silent-speech result.",
      "expert_true_value": "The practical contribution is a low-friction whispered-speech conversion stack that can run in real time without per-user paired corpora.",
      "canon_before": "Whisper-to-normal conversion typically required paired whisper-normal corpora or speaker-dependent training, making discreet speech interfaces hard to deploy.",
      "delta_from_canon": "WESPER removes the paired-data requirement by learning common speech units from unpaired whisper and normal speech.",
      "position_in_field": "A strong adjacent paper on discreet speech interaction, but it sits next to SSI rather than inside fully silent speech.",
      "practical_value": "Useful for privacy-preserving whispered interaction and accessibility, especially when a standard microphone is preferred over special sensors.",
      "axes_moved": "discreet_speech; real_time_conversion; speaker_independent_conversion",
      "axes_unresolved": "Full-silence interaction and broader real-world robustness remain unresolved.",
      "axes_regressed": "",
      "technical_limits": "The method still depends on whispered speech rather than truly silent articulation.",
      "evaluation_limits": "The core evidence is on whispered conversion datasets and listening studies, not on silent-speech benchmarks.",
      "deployment_limits": "Although real time is shown, the paper does not establish a full silent interface or broad public-use robustness.",
      "scope_limits": "Whisper-to-normal conversion only; this is not fully silent speech.",
      "task": "whisper-to-normal speech conversion",
      "input_modality": "whispered speech from an ordinary microphone",
      "sensor_hardware": "ordinary microphone",
      "body_site": "",
      "output_type": "speech-audio",
      "vocabulary_type": "",
      "vocabulary_size": "",
      "metrics": "On wTIMIT whispers, Google ASR reports WER 44.70 and CER 28.38, while WESPER-converted whispers improve to WER 26.68 and CER 12.70; the HuBERT-base setup pretrained on Librispeech+wTIMIT reaches WER 13.75 and CER 5.47.",
      "evaluation_mode": "MOS, MUSHRA, and speech-recognition evaluation on whispered and converted speech",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 0.97,
          "statement": "The abstract says WESPER is a zero-shot real-time whisper-to-normal conversion mechanism built from a speech-to-unit encoder and a unit-to-speech decoder.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_wesper-zero-shot-and-realtime-whisper-to-normal-voice-conversion-for-whisper-based-speech-interactions-wesper-zero-shot-and-realtime-whisper-to-normal-voice-conversion-for-whisper-bas.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_wesper-zero-shot-and-realtime-whisper-to-normal-voice-conversion-for-whisper-based-speech-interactions-wesper-zero-shot-and-realtime-whisper-to-normal-voice-conversion-for-whisper-bas.txt",
          "section_or_location": "ABSTRACT",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "deployment_claim",
          "confidence": 0.96,
          "statement": "The introduction states that the encoder and decoder operate in a non-autoregressive manner so the entire system works in real time and does not need per-user training.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_wesper-zero-shot-and-realtime-whisper-to-normal-voice-conversion-for-whisper-based-speech-interactions-wesper-zero-shot-and-realtime-whisper-to-normal-voice-conversion-for-whisper-bas.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_wesper-zero-shot-and-realtime-whisper-to-normal-voice-conversion-for-whisper-based-speech-interactions-wesper-zero-shot-and-realtime-whisper-to-normal-voice-conversion-for-whisper-bas.txt",
          "section_or_location": "1    INTRODUCTION",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.97,
          "statement": "Table 2 shows Google ASR on whispered wTIMIT at WER 44.70/CER 28.38, while WESPER-converted whispers improve to WER 26.68/CER 12.70; the HuBERT-base setup pretrained on Librispeech+wTIMIT reaches WER 13.75/CER 5.47.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_wesper-zero-shot-and-realtime-whisper-to-normal-voice-conversion-for-whisper-based-speech-interactions-wesper-zero-shot-and-realtime-whisper-to-normal-voice-conversion-for-whisper-bas.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_wesper-zero-shot-and-realtime-whisper-to-normal-voice-conversion-for-whisper-based-speech-interactions-wesper-zero-shot-and-realtime-whisper-to-normal-voice-conversion-for-whisper-bas.txt",
          "section_or_location": "Table 2",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 0.94,
          "statement": "The introduction explicitly motivates the work with whispered speech, not fully silent articulation, because ordinary microphones can capture whispers whereas silent speech systems often need special sensors.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_wesper-zero-shot-and-realtime-whisper-to-normal-voice-conversion-for-whisper-based-speech-interactions-wesper-zero-shot-and-realtime-whisper-to-normal-voice-conversion-for-whisper-bas.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_wesper-zero-shot-and-realtime-whisper-to-normal-voice-conversion-for-whisper-based-speech-interactions-wesper-zero-shot-and-realtime-whisper-to-normal-voice-conversion-for-whisper-bas.txt",
          "section_or_location": "1    INTRODUCTION",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_duration-aware-pause-insertion-using-pre-trained-language-model-for-multi-speaker-text-to-speech",
      "slug": "duration-aware-pause-insertion-using-pre-trained-language-model-for-multi-speaker-text-to-speech",
      "title": "Duration-aware pause insertion using pre-trained language model for multi-speaker text-to-speech",
      "year": 2023,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Dong Yang",
        "Tomoki Koriyama",
        "Yuki Saito",
        "Takaaki Saeki",
        "Detai Xin",
        "Hiroshi Saruwatari"
      ],
      "url": "https://nao-ki-mura.com/paper/duration-aware-pause-insertion-using-pre-trained-language-model-for-multi-speaker-text-to-speech",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2302.13652",
      "arxiv_url": "https://arxiv.org/abs/2302.13652",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "output:speech-audio",
        "evaluation:quantitative"
      ],
      "expert_take_short": "The paper presents a strong multi-speaker TTS phrasing approach leveraging speaker-conditioned BERT embeddings and pause duration categories to improve pause insertion precision and synthetic speech rhythm; however, it is out-of-scope for SSI as it focuses on audible speech synthesis only.",
      "expert_take_long": "This paper proposes a novel multi-speaker pause insertion framework for TTS that explicitly models speaker-dependent respiratory pause insertion and multi-category pause durations. Leveraging pretrained BERT representations enriched with latent speaker embeddings in BiLSTM decoders, the Respiratory Pause Insertion model alone shows substantial improvement in pause position precision and recall over a conventional baseline. Building on this, the Categorized Pause Insertion model further classifies pauses into brief, medium, and long duration classes for both respiratory and punctuation-induced pauses, enhancing rhythm naturalness as validated through subjective A/B preference tests on synthesized speech using FastSpeech 2. The data underpinning this work is a large-scale English audiobook corpus with over two thousand speakers, aligned text and audio. While objective metric improvements and human preferences support the utility of speaker conditioning and duration-aware phrasing, the scope is limited to English audiobooks with aligned text and does not explore silent articulation or cross-lingual generalization. Deployment relies on obtaining speaker embeddings and aligned corpora, and the approach is demonstrated within a particular TTS architecture. Overall, the paper presents a strong contribution to TTS phrasing emphasizing multi-speaker style adaptation but is peripheral to silent speech interfaces or non-acoustic input paradigms. Future work may explore broader prosodic features and other languages.",
      "expert_true_value": "Demonstrates that explicitly incorporating speaker embeddings and duration categories for respiratory and punctuation-induced pauses significantly improves multi-speaker pause insertion precision and synthetic speech rhythm in TTS systems.",
      "canon_before": "Pause insertion work largely optimized generic phrasing and ignored speaker-specific pause style in multi-speaker corpora.",
      "delta_from_canon": "Adds speaker embeddings and explicit duration-based pause categories, enabling joint optimization of phrasing and pause length conditioned on speaker style.",
      "position_in_field": "A TTS phrasing paper adjacent to SSI only through speech synthesis, not a silent speech interface contribution.",
      "practical_value": "Useful for multi-speaker TTS frontends requiring natural phrasing and rhythm without redesigning synthesis acoustics.",
      "axes_moved": "system_design; evaluation",
      "axes_unresolved": "Cross-lingual transfer; broader prosody control; SSI relevance",
      "axes_regressed": "",
      "technical_limits": "Restricted to English audiobook-style data with aligned text and audio; requires speaker embeddings; pause duration categorization thresholds could be further improved; silent speech and multi-lingual scenarios unaddressed.",
      "evaluation_limits": "Objective evaluation limited to pause position and category prediction precision/recall on LibriTTS; subjective A/B rhythm preference tests conducted only on 16 selected speakers and 277 test sentences.",
      "deployment_limits": "Requires aligned text-audio multi-speaker corpora and speaker embeddings; demonstrated only with FastSpeech 2 TTS backend and English audiobook-style data; no testing on conversational or cross-lingual settings.",
      "scope_limits": "Focuses on pause insertion and duration categorization for TTS phrasing; does not address silent articulation or silent speech interfaces.",
      "task": "pause insertion for multi-speaker TTS",
      "input_modality": "text",
      "sensor_hardware": "",
      "body_site": "",
      "output_type": "speech-audio",
      "vocabulary_type": "subwords + punctuation",
      "vocabulary_size": "9,098,772 train tokens",
      "metrics": "Respiratory pause precision ~0.569, recall ~0.272, F0.5 ~0.467; Categorized pause insertion respiratory pause precision ~0.575, recall ~0.261, F0.5 ~0.463; categorization of punctuation pauses with precision ~0.848, recall ~0.996, F2 ~0.962; subjective A/B preference tests with 30 listeners and 277 utterances showing consistent rhythm improvement.",
      "evaluation_mode": "Objective metrics on respiratory and punctuation-induced pause detection precision, recall, and F-scores; subjective A/B preference tests of rhythm conducted with 30 listeners using synthetic speech from FastSpeech 2 with HiFi-GAN vocoder.",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 1,
          "statement": "Our approach uses bidirectional encoder representations from transformers (BERT) pre-trained on a large-scale text corpus, injecting speaker embeddings to capture various speaker characteristics. We also leverage duration-aware pause insertion for more natural multi-speaker TTS.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_duration-aware-pause-insertion-using-pre-trained-language-model-for-multi-speaker-text-to-speech-duration-aware-pause-insertion-using-pre-trained-language-model-for-multi-speake.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_duration-aware-pause-insertion-using-pre-trained-language-model-for-multi-speaker-text-to-speech-duration-aware-pause-insertion-using-pre-trained-language-model-for-multi-speake.txt",
          "section_or_location": "ABSTRACT",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "author_claim",
          "confidence": 1,
          "statement": "This paper proposes two multi-speaker pause insertion models: the respiratory pause insertion (RPI) model and the categorized pause insertion (CPI) model. The architecture is based on BERT and BiLSTM. The RPI model uses speaker embeddings and predicts respiratory pause positions; the CPI model predicts both respiratory and punctuation-indicated pauses categorized by duration.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_duration-aware-pause-insertion-using-pre-trained-language-model-for-multi-speaker-text-to-speech-duration-aware-pause-insertion-using-pre-trained-language-model-for-multi-speake.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_duration-aware-pause-insertion-using-pre-trained-language-model-for-multi-speaker-text-to-speech-duration-aware-pause-insertion-using-pre-trained-language-model-for-multi-speake.txt",
          "section_or_location": "1. INTRODUCTION",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "fact",
          "confidence": 1,
          "statement": "The dataset is constructed from LibriTTS, a multi-speaker English corpus derived from audiobooks on the LibriVox website, containing over 400,000 training sentences, more than 2,000 speakers, with detailed pause duration alignment using Montreal Forced Aligner (MFA).",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_duration-aware-pause-insertion-using-pre-trained-language-model-for-multi-speaker-text-to-speech-duration-aware-pause-insertion-using-pre-trained-language-model-for-multi-speake.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_duration-aware-pause-insertion-using-pre-trained-language-model-for-multi-speaker-text-to-speech-duration-aware-pause-insertion-using-pre-trained-language-model-for-multi-speake.txt",
          "section_or_location": "2. DATASET",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "fact",
          "confidence": 1,
          "statement": "The RPI model integrates BERT with two BiLSTM layers decoding BERT output and speaker embeddings, to predict respiratory pause positions. The CPI model extends this with multi-task BiLSTMs predicting probability and category (brief, medium, long) for both respiratory and punctuation-induced pauses.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_duration-aware-pause-insertion-using-pre-trained-language-model-for-multi-speaker-text-to-speech-duration-aware-pause-insertion-using-pre-trained-language-model-for-multi-speake.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_duration-aware-pause-insertion-using-pre-trained-language-model-for-multi-speaker-text-to-speech-duration-aware-pause-insertion-using-pre-trained-language-model-for-multi-speake.txt",
          "section_or_location": "4. PROPOSED METHOD",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "RPI model achieves respiratory pause position prediction precision 0.569, recall 0.272, F0.5 0.467; CPI model achieves respiratory pause precision 0.575, recall 0.261, F0.5 0.463 and punctuation pause position precision 0.848, recall 0.996, F2 0.962.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_duration-aware-pause-insertion-using-pre-trained-language-model-for-multi-speaker-text-to-speech-duration-aware-pause-insertion-using-pre-trained-language-model-for-multi-speake.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_duration-aware-pause-insertion-using-pre-trained-language-model-for-multi-speaker-text-to-speech-duration-aware-pause-insertion-using-pre-trained-language-model-for-multi-speake.txt",
          "section_or_location": "5. EXPERIMENTAL EVALUATIONS",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 1,
          "statement": "Subjective AB preference tests conducted with 30 native listeners on synthesized speech (FastSpeech 2 with HiFi-GAN) using 277 sentences from 16 speakers showed significant preference for CPI over baseline and RPI models, establishing better rhythm perception linked to categorized pause insertion.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_duration-aware-pause-insertion-using-pre-trained-language-model-for-multi-speaker-text-to-speech-duration-aware-pause-insertion-using-pre-trained-language-model-for-multi-speake.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_duration-aware-pause-insertion-using-pre-trained-language-model-for-multi-speaker-text-to-speech-duration-aware-pause-insertion-using-pre-trained-language-model-for-multi-speake.txt",
          "section_or_location": "5. EXPERIMENTAL EVALUATIONS",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "deployment_claim",
          "confidence": 1,
          "statement": "The system demonstration depends on a TTS pipeline with aligned text/audio data and speaker embeddings, and was tested specifically with the FastSpeech 2 TTS architecture and English audiobook-style data. Generalization to conversational speech or other languages has not been addressed, limiting deployment readiness.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_duration-aware-pause-insertion-using-pre-trained-language-model-for-multi-speaker-text-to-speech-duration-aware-pause-insertion-using-pre-trained-language-model-for-multi-speake.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_duration-aware-pause-insertion-using-pre-trained-language-model-for-multi-speaker-text-to-speech-duration-aware-pause-insertion-using-pre-trained-language-model-for-multi-speake.txt",
          "section_or_location": "6. CONCLUSION",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 1,
          "statement": "Limitations include reliance on large-scale English audiobook TTS datasets with aligned pauses; thresholds for pause duration categories leave room for improvement; absence of evaluation on silent articulation or SSI-related applications; and technical limits on generalizing speaker embedding incorporation beyond the tested settings.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_duration-aware-pause-insertion-using-pre-trained-language-model-for-multi-speaker-text-to-speech-duration-aware-pause-insertion-using-pre-trained-language-model-for-multi-speake.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_duration-aware-pause-insertion-using-pre-trained-language-model-for-multi-speaker-text-to-speech-duration-aware-pause-insertion-using-pre-trained-language-model-for-multi-speake.txt",
          "section_or_location": "6. CONCLUSION",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_liplearner-customizable-silent-speech-interactions-on-mobile-devices",
      "slug": "liplearner-customizable-silent-speech-interactions-on-mobile-devices",
      "title": "LipLearner: Customizable Silent Speech Interactions on Mobile Devices",
      "year": 2023,
      "venue": "Proceedings of the 2023 CHI Conference on Human Factors in Computing Systems (CHI '23)",
      "authors": [
        "Zixiong Su",
        "Shitao Fang",
        "Jun Rekimoto"
      ],
      "url": "https://nao-ki-mura.com/paper/liplearner-customizable-silent-speech-interactions-on-mobile-devices",
      "doi": "10.1145/3544548.3581465",
      "doi_url": "https://doi.org/10.1145/3544548.3581465",
      "arxiv_id": "2302.05907",
      "arxiv_url": "https://arxiv.org/abs/2302.05907",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:lip",
        "deployment:hands-free",
        "deployment:mobile-suitable",
        "deployment:real-time",
        "evaluation:quantitative",
        "evaluation:walking-tested",
        "modality:video",
        "output:commands",
        "task:command-recognition"
      ],
      "expert_take_short": "LipLearner is a strong mobile silent speech system that uniquely closes the loop from few-shot lipreading model design to practical on-device customization and keyword spotting, demonstrated robustly in real-world conditions and a user study.",
      "expert_take_long": "LipLearner advances the state-of-the-art in mobile silent speech interfaces by leveraging contrastive pretraining on public lipreading data to extract robust visual speech embeddings suitable for few-shot adaptation. Through a simple linear classifier trained on just a few shots per command, combined with Voice2Lip—a novel automatic annotation method using vocalized speech for registering new silent speech commands—the system allows efficient, practical personalization on commodity smartphones. Their mobile app prototype further integrates a visual keyword spotting mechanism that avoids misactivation common in mouth-opening triggers, and supports on-device incremental learning to refine recognition during usage. The paper thoroughly evaluates the approach on extensive datasets with challenging real-world conditions, showing strong generalizability and robustness, and validates usability in a 16-participant user study with customizable multilingual commands. Remaining challenges include user effort in active learning and disambiguation of similar commands, as well as limited scale relative to open vocabulary applications. Overall, the paper makes a significant contribution in closing the loop from model design to practical mobile silent speech interaction.",
      "expert_true_value": "The key advance is enabling end-to-end on-device silent speech interaction with few-shot customizable command registration via Voice2Lip, practical keyword spotting, and incremental learning, rather than just offline lipreading accuracy. This system design and evaluation close a major gap towards real mobile SSI deployment.",
      "canon_before": "Mobile lipreading interfaces were typically fixed-vocabulary or relied on costly per-user data collection and retraining from scratch, limiting personalization and deployment on commodity devices.",
      "delta_from_canon": "LipLearner replaces training-from-scratch approaches with a few-shot mobile workflow incorporating Voice2Lip for easy command enrollment, a keyword spotting system robust to misactivation, and on-device incremental fine-tuning enabling practical customization.",
      "position_in_field": "One of the clearest mobile SSI papers tying model design directly to practical enrollment and real-time use on commodity devices.",
      "practical_value": "High for enabling privacy-preserving, hands-free silent speech command input on smartphones that users can adapt with minimal effort.",
      "axes_moved": "few_shot_customization; mobile_ssi; keyword_spotting; on_device_learning",
      "axes_unresolved": "open_vocabulary_interaction; lower_effort_active_learning",
      "axes_regressed": "",
      "technical_limits": "Fails on very similar commands causing confusion; requires visible lip movements; incremental learning demands user participation.",
      "evaluation_limits": "Evaluation covers a 25-command classification task and a 30-command live study, both limited in scale and vocabulary; testing was mainly on frontal lip videos under defined lighting/posture/gesture manipulations; performance on open vocabulary or on completely unseen environments beyond the tested conditions is unknown.",
      "deployment_limits": "User burden remains due to active learning and need for correction when similar commands cause confusion; reliance on visible lips limits use in occluded conditions; current vocabulary and study scale modest relative to open natural language; threshold tuning for keyword spotting remains user-dependent.",
      "scope_limits": "Mobile silent command interaction on smartphone, not general open-vocabulary speech recognition.",
      "task": "command-recognition",
      "input_modality": "video (front camera lip region)",
      "sensor_hardware": "smartphone front camera",
      "body_site": "lip",
      "output_type": "commands",
      "vocabulary_type": "command vocabulary; multilingual user-defined",
      "vocabulary_size": "customizable, user-defined; tested with 25 to 30 commands",
      "metrics": "One-shot 25-command F1 0.8947; mobile app 30-command one-shot accuracy 81.7%, five-shot 98.8%; keyword spotting average EER 6.75%; on-device latency about 422 ms.",
      "evaluation_mode": "few-shot with cross-condition robustness, keyword spotting evaluated by EER, one-shot F1, user study involving real-time silent speech command issuance and incremental learning",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 1,
          "statement": "The paper proposes customizable mobile silent-speech interaction with few-shot lipreading, Voice2Lip registration, visual keyword spotting, and on-device incremental learning.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_liplearner-customizable-silent-speech-interactions-on-mobile-devices-liplearner-customizable-silent-speech-interactions-on-mobile-devices.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_liplearner-customizable-silent-speech-interactions-on-mobile-devices-liplearner-customizable-silent-speech-interactions-on-mobile-devices.txt",
          "section_or_location": "ABSTRACT",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "For 25-command classification, F1-score of 0.8947 was achieved with one shot; app study reports 30-command one-shot accuracy 81.7% and five-shot 98.8%; keyword spotting average EER 6.75%; on-device latency about 422 ms.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_liplearner-customizable-silent-speech-interactions-on-mobile-devices-liplearner-customizable-silent-speech-interactions-on-mobile-devices.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_liplearner-customizable-silent-speech-interactions-on-mobile-devices-liplearner-customizable-silent-speech-interactions-on-mobile-devices.txt",
          "section_or_location": "ABSTRACT",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "deployment_claim",
          "confidence": 1,
          "statement": "LipLearner runs on a commodity smartphone in real-time with latency about 422 ms, all processing on-device supporting hands-free silent speech interaction without network connection.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_liplearner-customizable-silent-speech-interactions-on-mobile-devices-liplearner-customizable-silent-speech-interactions-on-mobile-devices.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_liplearner-customizable-silent-speech-interactions-on-mobile-devices-liplearner-customizable-silent-speech-interactions-on-mobile-devices.txt",
          "section_or_location": "6.2 System Implementation",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 1,
          "statement": "Evaluation conducted with LRW public dataset for contrastive pretraining plus an in-the-wild dataset collected with 11 participants in 7 conditions for a total of 9625 clips; a 16-participant live user study tested real-time activation, command registration, and recognition, including multilingual and user-defined commands.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_liplearner-customizable-silent-speech-interactions-on-mobile-devices-liplearner-customizable-silent-speech-interactions-on-mobile-devices.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_liplearner-customizable-silent-speech-interactions-on-mobile-devices-liplearner-customizable-silent-speech-interactions-on-mobile-devices.txt",
          "section_or_location": "4",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 1,
          "statement": "Technical limits include reliance on visible lips, confusion between very similar commands, and cognitive and physical user burden in active learning.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_liplearner-customizable-silent-speech-interactions-on-mobile-devices-liplearner-customizable-silent-speech-interactions-on-mobile-devices.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_liplearner-customizable-silent-speech-interactions-on-mobile-devices-liplearner-customizable-silent-speech-interactions-on-mobile-devices.txt",
          "section_or_location": "10",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_towards-neural-decoding-of-imagined-speech-based-on-spoken-speech",
      "slug": "towards-neural-decoding-of-imagined-speech-based-on-spoken-speech",
      "title": "Towards Neural Decoding of Imagined Speech based on Spoken Speech",
      "year": 2022,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Seo-Hyun Lee",
        "Young-Eun Lee",
        "Soowon Kim",
        "Byung-Kwan Ko",
        "Seong-Whan Lee"
      ],
      "url": "https://nao-ki-mura.com/paper/towards-neural-decoding-of-imagined-speech-based-on-spoken-speech",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2212.02047",
      "arxiv_url": "https://arxiv.org/abs/2212.02047",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "medium-high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:brain",
        "modality:eeg",
        "output:labels",
        "task:speech-recognition",
        "evaluation:quantitative"
      ],
      "expert_take_short": "Transfer of CSP+SVM models trained on spoken speech EEG to imagined speech achieves comparable, though slightly lower, accuracy within a limited 5-class, 7-subject offline EEG setup, with visual imagery control supporting specificity.",
      "expert_take_long": "This work provides an initial quantitative assessment that classifiers trained on spoken speech EEG can transfer to imagined speech decoding with only a modest reduction in accuracy (26.8% transferred vs. 30.5% direct imagined speech, p=0.0983) on a small 5-class EEG dataset from seven subjects. A visual imagery control shows a significant drop in performance when transferred from spoken speech (p=0.022), supporting speech-specific neural feature overlap between spoken and imagined speech. However, the study is limited by small sample size, simple CSP+SVM models, offline analysis, and restricted five-class vocabulary, with no online or cross-subject tests. While promising, these results represent a preliminary baseline for transfer-based imagined speech EEG decoding rather than a fully deployable silent speech interface.",
      "expert_true_value": "The paper establishes a baseline showing that spoken speech EEG models can be transferred to imagined speech decoding with no statistically significant performance drop in a small, offline 5-class EEG task, introducing a novel transfer learning perspective with a visual imagery contrast to demonstrate speech specificity.",
      "canon_before": "Imagined-speech EEG decoding methods typically require direct training on limited imagined-speech data and rarely explore transfer from spoken speech EEG.",
      "delta_from_canon": "The paper reframes imagined speech decoding as a transfer learning task from spoken speech EEG pretraining, with visual imagery decoding serving as a negative control to test speech specificity.",
      "position_in_field": "An adjacent imagined-speech EEG transfer learning study rather than a direct silent speech interface or articulatory speech decoding study.",
      "practical_value": "Provides a useful baseline for imagined speech EEG transfer approaches but lacks immediate applicability as a communication system due to limited vocabulary and offline evaluation.",
      "axes_moved": "Problem framing from direct imagined speech EEG training to cross-paradigm transfer learning; evaluation novelty by contrasting spoken speech transfer with visual imagery control.",
      "axes_unresolved": "Cross-subject generalization, scalability to larger vocabularies, real-time or online decoding performance.",
      "axes_regressed": "",
      "technical_limits": "Limited to offline 5-class classification with CSP+SVM; no deep learning or larger vocabulary; small cohort; no online or cross-subject tests.",
      "evaluation_limits": "Evaluation limited to offline, within-subject 10-fold cross-validation on 7 subjects and 5 classes; no cross-subject or larger-vocabulary validation.",
      "deployment_limits": "No online or real-time decoding implementation, no assistive user studies, no latency or practical deployment analysis.",
      "scope_limits": "Scope limited to preliminary within-subject EEG decoding using CSP+SVM on a 5-class vocabulary, not a comprehensive or scalable imagined speech system.",
      "task": "speech-recognition",
      "input_modality": "eeg",
      "sensor_hardware": "64-channel EEG cap with active electrodes placed according to international 10-10 system, referenced at FCz.",
      "body_site": "brain",
      "output_type": "labels",
      "vocabulary_type": "closed-class word set",
      "vocabulary_size": "5 classes",
      "metrics": "Classification accuracy averaged across 7 subjects: imagined speech direct training 30.5% ± 4.9% vs. spoken speech transfer 26.8% ± 2.0% (p=0.0983); visual imagery direct 31.8% ± 4.1% vs. spoken speech transfer 26.3% ± 2.4% (p=0.022). Statistical tests include Kruskal-Wallis and bootstrap post-hoc analysis.",
      "evaluation_mode": "10-fold cross-validation within subjects; statistical significance tested using Kruskal-Wallis and bootstrap post-hoc analysis.",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 1,
          "statement": "In this paper, we performed a preliminary analysis to find out whether if it would be possible to utilize spoken speech electroencephalography data to decode imagined speech, by simply applying the pre-trained model trained with spoken speech brain signals to decode imagined speech.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_towards-neural-decoding-of-imagined-speech-based-on-spoken-speech-towards-neural-decoding-of-imagined-speech-based-on-spoken-speech.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_towards-neural-decoding-of-imagined-speech-based-on-spoken-speech-towards-neural-decoding-of-imagined-speech-based-on-spoken-speech.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "The classification performance of imagined speech data solely used to train and validation was 30.5 ± 4.9 %, the transferred performance of spoken speech based classifier to imagined speech data was 26.8 ± 2.0 %.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_towards-neural-decoding-of-imagined-speech-based-on-spoken-speech-towards-neural-decoding-of-imagined-speech-based-on-spoken-speech.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_towards-neural-decoding-of-imagined-speech-based-on-spoken-speech-towards-neural-decoding-of-imagined-speech-based-on-spoken-speech.txt",
          "section_or_location": "III. RESULTS",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "For visual imagery, solely trained performance was 31.8 ± 4.1 % and transferred performance of 26.3 ± 2.4 % with statistically significant difference p = 0.022.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_towards-neural-decoding-of-imagined-speech-based-on-spoken-speech-towards-neural-decoding-of-imagined-speech-based-on-spoken-speech.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_towards-neural-decoding-of-imagined-speech-based-on-spoken-speech-towards-neural-decoding-of-imagined-speech-based-on-spoken-speech.txt",
          "section_or_location": "III. RESULTS",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 1,
          "statement": "The dataset involved 7 subjects and 5-class words/phrases recorded with a 64-channel EEG system.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_towards-neural-decoding-of-imagined-speech-based-on-spoken-speech-towards-neural-decoding-of-imagined-speech-based-on-spoken-speech.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_towards-neural-decoding-of-imagined-speech-based-on-spoken-speech-towards-neural-decoding-of-imagined-speech-based-on-spoken-speech.txt",
          "section_or_location": "II. MATERIALS",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 1,
          "statement": "The model used was classical CSP+SVM on 64-channel EEG with only five classes and seven subjects, without online or cross-subject evaluation.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_towards-neural-decoding-of-imagined-speech-based-on-spoken-speech-towards-neural-decoding-of-imagined-speech-based-on-spoken-speech.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_towards-neural-decoding-of-imagined-speech-based-on-spoken-speech-towards-neural-decoding-of-imagined-speech-based-on-spoken-speech.txt",
          "section_or_location": "III. RESULTS",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "deployment_claim",
          "confidence": 1,
          "statement": "No online BCI, assistive-user study, latency analysis, or practical silent communication demonstration was performed.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_towards-neural-decoding-of-imagined-speech-based-on-spoken-speech-towards-neural-decoding-of-imagined-speech-based-on-spoken-speech.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_towards-neural-decoding-of-imagined-speech-based-on-spoken-speech-towards-neural-decoding-of-imagined-speech-based-on-spoken-speech.txt",
          "section_or_location": "IV. CONCLUSION",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_breaking-the-trade-off-in-personalized-speech-enhancement-with-cross-task-knowledge-distillation",
      "slug": "breaking-the-trade-off-in-personalized-speech-enhancement-with-cross-task-knowledge-distillation",
      "title": "Breaking the trade-off in personalized speech enhancement with cross-task knowledge distillation",
      "year": 2022,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Hassan Taherian",
        "Seif Emre Eskimez",
        "Takuya Yoshioka"
      ],
      "url": "https://nao-ki-mura.com/paper/breaking-the-trade-off-in-personalized-speech-enhancement-with-cross-task-knowledge-distillation",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2211.02944",
      "arxiv_url": "https://arxiv.org/abs/2211.02944",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "evaluation:quantitative",
        "modality:acoustic",
        "output:speech-audio",
        "task:speech-enhancement"
      ],
      "expert_take_short": "Strong causal PSE paper, not SSI. The pVAD-guided loss is the part that holds up under full-text reading.",
      "expert_take_long": "This paper is best read as a careful training-method paper for causal personalized enhancement. The full text shows the trade-off explicitly in Table 1: adding ITS samples kills leakage in TS3 but worsens over-suppression, and the proposed pVAD-guided losses recover much of that damage. That is a meaningful result for speech enhancement, but it remains adjacent to SSI rather than part of it.",
      "expert_true_value": "The useful contribution is training-time trade-off control for causal PSE, not a new interface or SSI method.",
      "canon_before": "Causal personalized speech enhancement usually reduced either over-suppression or interference leakage, but not both at once.",
      "delta_from_canon": "The pVAD task is used during training to suppress misleading frames from inactive-target scenarios rather than treating every frame equally.",
      "position_in_field": "A solid adjacent speech-enhancement paper, clearly outside core SSI.",
      "practical_value": "Relevant to teleconferencing and target-speaker enhancement pipelines where inactive-target leakage matters.",
      "axes_moved": "leakage_control; over_suppression_management; causal_training",
      "axes_unresolved": "Generalization beyond the simulated scenarios and SSI relevance remain unresolved.",
      "axes_regressed": "",
      "technical_limits": "The work remains acoustic causal PSE and does not extend to silent-speech interaction.",
      "evaluation_limits": "The evidence is scenario-specific to TS1/TS2/TS3 simulations.",
      "deployment_limits": "No user-facing interaction or SSI deployment is claimed.",
      "scope_limits": "Causal personalized speech enhancement only.",
      "task": "personalized speech enhancement",
      "input_modality": "mixed speech audio plus target-speaker conditioning",
      "sensor_hardware": "microphone",
      "body_site": "",
      "output_type": "speech-audio",
      "vocabulary_type": "",
      "vocabulary_size": "",
      "metrics": "The proposed S1 model keeps TS1/TS2 over-suppression close to B1 while improving TS3 leakage energy from 46.5 dB to 148.5 dB; in TS2 it changes WER from 16.8 to 17.8 and TSOS from 0.45 to 0.37.",
      "evaluation_mode": "WER, DEL, DNSMOS, STOI, TSOS, and leakage-energy evaluation on TS1/TS2/TS3 scenarios",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 0.97,
          "statement": "The abstract says the paper uses pVAD-based cross-task knowledge distillation to reduce both over-suppression and interference leakage in causal personalized speech enhancement.",
          "evidence_source": "/Users/naoki/projects/nao-ki-mura/inputs/ssi_fulltext/text/ssi_breaking-the-trade-off-in-personalized-speech-enhancement-with-cross-task-knowledge-distillation-breaking-the-trade-off-in-personalized-speech-enhancement-with-cross-task-knowle.txt",
          "source_ref": "/Users/naoki/projects/nao-ki-mura/inputs/ssi_fulltext/text/ssi_breaking-the-trade-off-in-personalized-speech-enhancement-with-cross-task-knowledge-distillation-breaking-the-trade-off-in-personalized-speech-enhancement-with-cross-task-knowle.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 0.95,
          "statement": "The training section explains that frames misclassified by the pVAD model are excluded or down-weighted so the enhancer is not punished for inactive-target cases in the same way as active-target cases.",
          "evidence_source": "/Users/naoki/projects/nao-ki-mura/inputs/ssi_fulltext/text/ssi_breaking-the-trade-off-in-personalized-speech-enhancement-with-cross-task-knowledge-distillation-breaking-the-trade-off-in-personalized-speech-enhancement-with-cross-task-knowle.txt",
          "source_ref": "/Users/naoki/projects/nao-ki-mura/inputs/ssi_fulltext/text/ssi_breaking-the-trade-off-in-personalized-speech-enhancement-with-cross-task-knowledge-distillation-breaking-the-trade-off-in-personalized-speech-enhancement-with-cross-task-knowle.txt",
          "section_or_location": "3.2. PSE Training with Cross-task Knowledge Distillation",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.97,
          "statement": "Table 1 and the discussion state that the S1 model keeps over-suppression close to B1 while improving TS3 leakage energy from 46.5 dB to 148.5 dB; in TS2 it changes WER from 16.8 to 17.8 and TSOS from 0.45 to 0.37.",
          "evidence_source": "/Users/naoki/projects/nao-ki-mura/inputs/ssi_fulltext/text/ssi_breaking-the-trade-off-in-personalized-speech-enhancement-with-cross-task-knowledge-distillation-breaking-the-trade-off-in-personalized-speech-enhancement-with-cross-task-knowle.txt",
          "source_ref": "/Users/naoki/projects/nao-ki-mura/inputs/ssi_fulltext/text/ssi_breaking-the-trade-off-in-personalized-speech-enhancement-with-cross-task-knowledge-distillation-breaking-the-trade-off-in-personalized-speech-enhancement-with-cross-task-knowle.txt",
          "section_or_location": "4.4. Results and Discussions",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 0.94,
          "statement": "The conclusion scopes the contribution to causal personalized speech enhancement and does not claim any silent-speech interface capability.",
          "evidence_source": "/Users/naoki/projects/nao-ki-mura/inputs/ssi_fulltext/text/ssi_breaking-the-trade-off-in-personalized-speech-enhancement-with-cross-task-knowledge-distillation-breaking-the-trade-off-in-personalized-speech-enhancement-with-cross-task-knowle.txt",
          "source_ref": "/Users/naoki/projects/nao-ki-mura/inputs/ssi_fulltext/text/ssi_breaking-the-trade-off-in-personalized-speech-enhancement-with-cross-task-knowledge-distillation-breaking-the-trade-off-in-personalized-speech-enhancement-with-cross-task-knowle.txt",
          "section_or_location": "5. CONCLUSION",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_movement-detection-of-tongue-and-related-body-parts-using-ir-uwb-radar",
      "slug": "movement-detection-of-tongue-and-related-body-parts-using-ir-uwb-radar",
      "title": "Movement Detection of Tongue and Related Body Parts Using IR-UWB Radar",
      "year": 2022,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Sunghwa Lee",
        "Younghoon Shin"
      ],
      "url": "https://nao-ki-mura.com/paper/movement-detection-of-tongue-and-related-body-parts-using-ir-uwb-radar",
      "doi": "10.1109/ICTC55196.2022.9952644",
      "doi_url": "https://doi.org/10.1109/ICTC55196.2022.9952644",
      "arxiv_id": "2209.01762",
      "arxiv_url": "https://arxiv.org/abs/2209.01762",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium-high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:tongue",
        "deployment:hands-free",
        "evaluation:quantitative",
        "modality:radar",
        "output:labels",
        "task:speech-recognition"
      ],
      "expert_take_short": "Good sensing primitive, very small task.",
      "expert_take_long": "The claim should stay narrow. The experimental section describes only two states: tongue resting on the floor of the mouth and a tongue-tip movement touching the palate before returning. Table I shows the proposed feature extraction plus GMM-HMM reaching 100/90/90/90 accuracy across the four participants, beating both CLEAN-based baselines. That is enough to take radar seriously as a contactless oral-motion sensor. It is not enough to claim word recognition, phoneme decoding, or robust silent-speech interaction because the entire study is four people, two states, short 1-3 second recordings, and manually delimited trials.",
      "expert_true_value": "The full text supports a narrow but real result: radar can detect simple invisible tongue movement states with at least 90% accuracy for each of four participants.",
      "canon_before": "Most SSI sensing work relied on contact sensors, audio, ultrasound, or visible articulators rather than contactless radar pointed under the chin.",
      "delta_from_canon": "The paper strips the problem down to a binary tongue-motion detection task and shows radar can separate the two states without contact.",
      "position_in_field": "Early contactless radar sensing paper for SSI-adjacent tongue-motion detection, not full speech decoding.",
      "practical_value": "Useful as a proof that a non-contact radar sensor can sense hidden oral motion before anyone tries larger-vocabulary SSI on the same hardware.",
      "axes_moved": "sensor modality; contactless oral-motion sensing",
      "axes_unresolved": "continuous articulation; larger vocabularies; user generalization",
      "axes_regressed": "",
      "technical_limits": "Only two states, four participants, manual trial boundaries, and a stationary lab setup are tested.",
      "evaluation_limits": "No continuous speech, no unseen-user split, and no vocabulary-level recognition are reported.",
      "deployment_limits": "The hardware is promising but far from a complete radar SSI system.",
      "scope_limits": "Binary tongue-motion detection only.",
      "task": "speech-recognition",
      "input_modality": "IR-UWB radar pointed at the chin",
      "sensor_hardware": "IR-UWB radar module with LNA evaluation board, sinuous antennas, and dielectric lens",
      "body_site": "tongue; chin",
      "output_type": "labels",
      "vocabulary_type": "binary tongue-state classification",
      "vocabulary_size": "2 classes",
      "metrics": "classification accuracy by participant",
      "evaluation_mode": "leave-one-out cross-validation against CLEAN+MD-DTW baselines on two tongue-motion states",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 0.99,
          "statement": "The abstract states that the study classifies motionless and moving states of an invisible tongue using an IR-UWB radar pointed toward the participant's chin.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_movement-detection-of-tongue-and-related-body-parts-using-ir-uwb-radar-movement-detection-of-tongue-and-related-body-parts-using-ir-uwb-radar.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_movement-detection-of-tongue-and-related-body-parts-using-ir-uwb-radar-movement-detection-of-tongue-and-related-body-parts-using-ir-uwb-radar.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 0.98,
          "statement": "Section A says the experiment uses four participants, two tongue-motion states, and twenty repetitions of each state per participant.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_movement-detection-of-tongue-and-related-body-parts-using-ir-uwb-radar-movement-detection-of-tongue-and-related-body-parts-using-ir-uwb-radar.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_movement-detection-of-tongue-and-related-body-parts-using-ir-uwb-radar-movement-detection-of-tongue-and-related-body-parts-using-ir-uwb-radar.txt",
          "section_or_location": "A. Experimental Environment",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 0.98,
          "statement": "Section B explains that the proposed method turns radar frame sets into envelope features and classifies them with a five-state left-to-right GMM-HMM.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_movement-detection-of-tongue-and-related-body-parts-using-ir-uwb-radar-movement-detection-of-tongue-and-related-body-parts-using-ir-uwb-radar.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_movement-detection-of-tongue-and-related-body-parts-using-ir-uwb-radar-movement-detection-of-tongue-and-related-body-parts-using-ir-uwb-radar.txt",
          "section_or_location": "B. Feature Extraction and Classifier Selection",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.99,
          "statement": "Table I reports participant-wise classification accuracy of 100, 90, 90, and 90 percent for the proposed feature extraction plus GMM-HMM pipeline.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_movement-detection-of-tongue-and-related-body-parts-using-ir-uwb-radar-movement-detection-of-tongue-and-related-body-parts-using-ir-uwb-radar.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_movement-detection-of-tongue-and-related-body-parts-using-ir-uwb-radar-movement-detection-of-tongue-and-related-body-parts-using-ir-uwb-radar.txt",
          "section_or_location": "TABLE I",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_lip-to-speech-synthesis-for-arbitrary-speakers-in-the-wild",
      "slug": "lip-to-speech-synthesis-for-arbitrary-speakers-in-the-wild",
      "title": "Lip-to-Speech Synthesis for Arbitrary Speakers in the Wild",
      "year": 2022,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Sindhu B Hegde",
        "K R Prajwal",
        "Rudrabha Mukhopadhyay",
        "Vinay P Namboodiri",
        "C. V. Jawahar"
      ],
      "url": "https://nao-ki-mura.com/paper/lip-to-speech-synthesis-for-arbitrary-speakers-in-the-wild",
      "doi": "10.1145/3503161.3548081",
      "doi_url": "https://doi.org/10.1145/3503161.3548081",
      "arxiv_id": "2209.00642",
      "arxiv_url": "https://arxiv.org/abs/2209.00642",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:face",
        "body_site:lip",
        "deployment:hands-free",
        "deployment:speaker-independent",
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "modality:video",
        "output:speech-audio",
        "task:speech-reconstruction"
      ],
      "expert_take_short": "The real contribution is not just another VAE-GAN; it is turning lip-to-speech into an arbitrary-speaker problem with credible low-data adaptation.",
      "expert_take_long": "Table 3 is the core evidence. On LRW and LRS2, the model posts the best perceptual metrics in the table, including FDSD/KDSD/LSE-D of 1.638/0.8/8.173 on LRW and 1.273/0.2/8.155 on LRS2, while prior lip-to-speech baselines collapse more badly on LRS2. Table 4 then shows the human side moving the same way, with the proposed model scoring 3.22 intelligibility, 2.98 perceptual quality, 2.28 sync accuracy, and 2.69 voice match, clearly above the listed alternatives. Figure 5 matters too: the multi-speaker pretrain nearly matches the single-speaker baseline with only 25% of the target-speaker data. The limitations section is honest that drastic head motion, non-frontal views, and incorrect word generation remain unresolved.",
      "expert_true_value": "The full text backs a meaningful field move: the model is not best on every raw metric, but it holds up on harder unconstrained datasets and shows that multi-speaker pretraining can nearly match a 20-hour single-speaker model with only 5 hours of adaptation data.",
      "canon_before": "Most lip-to-speech systems were either single-speaker, constrained-lab models or needed much more per-speaker data to work at all.",
      "delta_from_canon": "This paper pushes the task toward arbitrary identities in the wild and argues that distributional modeling plus speaker conditioning is necessary.",
      "position_in_field": "Strong speaker-general lip-to-speech paper focused on unconstrained identity and vocabulary conditions.",
      "practical_value": "Useful when collecting tens of hours per target speaker is unrealistic and the system must generalize across identities.",
      "axes_moved": "speaker generalization; problem framing; evaluation",
      "axes_unresolved": "robustness to head pose; language modeling; live deployment",
      "axes_regressed": "",
      "technical_limits": "The model still struggles with severe head movement, non-frontal heads, and language-level ambiguity that lip motion alone cannot resolve.",
      "evaluation_limits": "The unconstrained evidence is benchmark-based and the human study uses 15 LRS2 samples rated by 20 participants, not a deployment scenario.",
      "deployment_limits": "No live product path, robustness-to-camera-noise study, or interactive latency measurement is reported.",
      "scope_limits": "Arbitrary-speaker lip-to-speech synthesis from video only.",
      "task": "speech-reconstruction",
      "input_modality": "silent lip video",
      "sensor_hardware": "",
      "body_site": "face; lip",
      "output_type": "speech-audio",
      "vocabulary_type": "open vocabulary in the wild",
      "vocabulary_size": "LRW: 500 words; LRS2: 59k vocabulary",
      "metrics": "PESQ; STOI; SED; FDSD; KDSD; LSE-C; LSE-D; human ratings; fine-tuning data-efficiency curves",
      "evaluation_mode": "constrained GRID/TCD-TIMIT benchmarks, unconstrained LRW/LRS2 comparison, human evaluation, and low-data fine-tuning study",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 0.99,
          "statement": "The abstract says the model generates speech from silent lip videos for arbitrary speakers in the wild and is not restricted to a fixed number of speakers.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_lip-to-speech-synthesis-for-arbitrary-speakers-in-the-wild-lip-to-speech-synthesis-for-arbitrary-speakers-in-the-wild.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_lip-to-speech-synthesis-for-arbitrary-speakers-in-the-wild-lip-to-speech-synthesis-for-arbitrary-speakers-in-the-wild.txt",
          "section_or_location": "ABSTRACT",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 0.98,
          "statement": "Section 3.3.1 says unconstrained evaluation is built on LRW and the full LRS2 train plus pre-train data, with LRS2 covering thousands of speakers and about 59k vocabulary.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_lip-to-speech-synthesis-for-arbitrary-speakers-in-the-wild-lip-to-speech-synthesis-for-arbitrary-speakers-in-the-wild.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_lip-to-speech-synthesis-for-arbitrary-speakers-in-the-wild-lip-to-speech-synthesis-for-arbitrary-speakers-in-the-wild.txt",
          "section_or_location": "3.3.1 Datasets and Training Strategy.",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.99,
          "statement": "Table 3 reports the proposed model at 1.273 FDSD, 0.2 KDSD, 2.507 LSE-C, and 8.155 LSE-D on LRS2, outperforming the compared lip-to-speech baselines there.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_lip-to-speech-synthesis-for-arbitrary-speakers-in-the-wild-lip-to-speech-synthesis-for-arbitrary-speakers-in-the-wild.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_lip-to-speech-synthesis-for-arbitrary-speakers-in-the-wild-lip-to-speech-synthesis-for-arbitrary-speakers-in-the-wild.txt",
          "section_or_location": "Table 3: All models are pre-trained on LRW dataset and then trained on LRS2.",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 0.98,
          "statement": "The limitations section says the model still struggles with drastic head movement, non-frontal heads, and outputs that do not form the right words or phrases.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_lip-to-speech-synthesis-for-arbitrary-speakers-in-the-wild-lip-to-speech-synthesis-for-arbitrary-speakers-in-the-wild.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_lip-to-speech-synthesis-for-arbitrary-speakers-in-the-wild-lip-to-speech-synthesis-for-arbitrary-speakers-in-the-wild.txt",
          "section_or_location": "6     LIMITATIONS AND FUTURE DIRECTIONS",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_an-anchor-free-detector-for-continuous-speech-keyword-spotting",
      "slug": "an-anchor-free-detector-for-continuous-speech-keyword-spotting",
      "title": "An Anchor-Free Detector for Continuous Speech Keyword Spotting",
      "year": 2022,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Zhiyuan Zhao",
        "Chuanxin Tang",
        "Chengdong Yao",
        "Chong Luo"
      ],
      "url": "https://nao-ki-mura.com/paper/an-anchor-free-detector-for-continuous-speech-keyword-spotting",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2208.04622",
      "arxiv_url": "https://arxiv.org/abs/2208.04622",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "modality:acoustic",
        "output:labels",
        "task:speech-recognition"
      ],
      "expert_take_short": "Strong CSKWS paper, not SSI. The detection framing and unknown class are the points that hold up in full text.",
      "expert_take_long": "The paper is technically solid but belongs outside the SSI core. What the full text shows clearly is that the detection framing matters: adapted classifier baselines keep high trimmed-input accuracy yet fail badly on AP and FRR, while AF-KWS stays fast and sharply improves temporal detection quality. That makes it a useful adjacent benchmark, not a silent-speech interaction result.",
      "expert_true_value": "The real contribution is benchmark plus formulation: continuous keyword spotting behaves like detection, not like ordinary command classification.",
      "canon_before": "Continuous keyword spotting was usually adapted from trigger-word or speech-command classification rather than treated as a detection problem.",
      "delta_from_canon": "AF-KWS turns CSKWS into 1D detection and adds an unknown class so non-keyword words, silence, and noise are modeled explicitly.",
      "position_in_field": "A solid benchmark-and-method paper for acoustic keyword spotting, but outside SSI proper.",
      "practical_value": "Relevant to meeting analysis and spoken keyword retrieval pipelines.",
      "axes_moved": "benchmark_design; detection_formulation; unknown_class_modeling",
      "axes_unresolved": "Open-vocabulary spotting and SSI relevance remain unresolved.",
      "axes_regressed": "",
      "technical_limits": "The work is limited to audio keyword spotting and does not solve SSI problems.",
      "evaluation_limits": "Results are benchmarked on LibriTop-20 and CMAK-style meeting keywords only.",
      "deployment_limits": "The paper is not an interaction-system deployment study.",
      "scope_limits": "Continuous speech keyword spotting only, outside silent-speech interaction.",
      "task": "continuous speech keyword spotting",
      "input_modality": "continuous speech audio",
      "sensor_hardware": "microphone",
      "body_site": "",
      "output_type": "labels",
      "vocabulary_type": "",
      "vocabulary_size": "",
      "metrics": "On LibriTop-20, AF-KWS reports AP@5 0.952, AP@75 0.886, mAP 0.860, FRR@5 0.140, FRR@25 0.049, and RTF 0.031, clearly ahead of the adapted classifier baselines.",
      "evaluation_mode": "AP, mAP, FRR, and real-time-factor evaluation on LibriTop-20 and CMAK-7",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 0.97,
          "statement": "The abstract formulates continuous speech keyword spotting as one-dimensional object detection and proposes the anchor-free AF-KWS detector with an auxiliary unknown class.",
          "evidence_source": "/Users/naoki/projects/nao-ki-mura/inputs/ssi_fulltext/text/ssi_an-anchor-free-detector-for-continuous-speech-keyword-spotting-an-anchor-free-detector-for-continuous-speech-keyword-spotting.txt",
          "source_ref": "/Users/naoki/projects/nao-ki-mura/inputs/ssi_fulltext/text/ssi_an-anchor-free-detector-for-continuous-speech-keyword-spotting-an-anchor-free-detector-for-continuous-speech-keyword-spotting.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 0.95,
          "statement": "The AF-KWS overview section defines separate heatmap, length, and offset heads and treats non-keyword words, silence, and non-speech as an unknown class.",
          "evidence_source": "/Users/naoki/projects/nao-ki-mura/inputs/ssi_fulltext/text/ssi_an-anchor-free-detector-for-continuous-speech-keyword-spotting-an-anchor-free-detector-for-continuous-speech-keyword-spotting.txt",
          "source_ref": "/Users/naoki/projects/nao-ki-mura/inputs/ssi_fulltext/text/ssi_an-anchor-free-detector-for-continuous-speech-keyword-spotting-an-anchor-free-detector-for-continuous-speech-keyword-spotting.txt",
          "section_or_location": "2.1. Overview of AF-KWS",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.98,
          "statement": "Table 3 reports AF-KWS at AP@5 0.952, AP@75 0.886, mAP 0.860, FRR@5 0.140, FRR@25 0.049, and RTF 0.031 on LibriTop-20, clearly beating the adapted classifier baselines.",
          "evidence_source": "/Users/naoki/projects/nao-ki-mura/inputs/ssi_fulltext/text/ssi_an-anchor-free-detector-for-continuous-speech-keyword-spotting-an-anchor-free-detector-for-continuous-speech-keyword-spotting.txt",
          "source_ref": "/Users/naoki/projects/nao-ki-mura/inputs/ssi_fulltext/text/ssi_an-anchor-free-detector-for-continuous-speech-keyword-spotting-an-anchor-free-detector-for-continuous-speech-keyword-spotting.txt",
          "section_or_location": "Table 3",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 0.95,
          "statement": "The dataset and conclusion sections scope the contribution to continuous acoustic keyword spotting on LibriTop-20 and CMAK rather than silent-speech interaction.",
          "evidence_source": "/Users/naoki/projects/nao-ki-mura/inputs/ssi_fulltext/text/ssi_an-anchor-free-detector-for-continuous-speech-keyword-spotting-an-anchor-free-detector-for-continuous-speech-keyword-spotting.txt",
          "source_ref": "/Users/naoki/projects/nao-ki-mura/inputs/ssi_fulltext/text/ssi_an-anchor-free-detector-for-continuous-speech-keyword-spotting-an-anchor-free-detector-for-continuous-speech-keyword-spotting.txt",
          "section_or_location": "5. Conclusion",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_fastlts-non-autoregressive-end-to-end-unconstrained-lip-to-speech-synthesis",
      "slug": "fastlts-non-autoregressive-end-to-end-unconstrained-lip-to-speech-synthesis",
      "title": "FastLTS: Non-Autoregressive End-to-End Unconstrained Lip-to-Speech Synthesis",
      "year": 2022,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Yongqi Wang",
        "Zhou Zhao"
      ],
      "url": "https://nao-ki-mura.com/paper/fastlts-non-autoregressive-end-to-end-unconstrained-lip-to-speech-synthesis",
      "doi": "10.1145/3503161.3548194",
      "doi_url": "https://doi.org/10.1145/3503161.3548194",
      "arxiv_id": "2207.03800",
      "arxiv_url": "https://arxiv.org/abs/2207.03800",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:face",
        "body_site:lip",
        "deployment:hands-free",
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "modality:video",
        "output:speech-audio",
        "task:speech-reconstruction"
      ],
      "expert_take_short": "This paper matters because it makes unconstrained lip-to-speech materially faster without obviously sacrificing quality.",
      "expert_take_long": "The strongest evidence is joint, not isolated. Table 3 shows FastLTS improving GRID MOS over Lip2Wav from 3.27/3.47/3.54 to 3.59/3.68/3.73 for quality, intelligibility, and naturalness, while Section 5.5 reports 19.76x waveform speedup at a 3-second window. Table 5 also shows the speedup is not bought with GlowLTS-scale bloat: FastLTS uses 50.09M parameters versus 39.87M for Lip2Wav and 85.92M for GlowLTS. The remaining caution is quality headroom: Table 4 gives FastLTS a GRID PESQ of 1.939, which is strong but not the top reported number in that comparison.",
      "expert_true_value": "The full text supports a real systems gain: FastLTS keeps competitive perceptual quality while pushing waveform inference to 19.76x the autoregressive baseline at 3-second input length.",
      "canon_before": "Unconstrained lip-to-speech systems typically predicted mel-spectrograms first and then relied on slow autoregressive or heavy flow-based waveform generation.",
      "delta_from_canon": "FastLTS removes the intermediate spectrogram bottleneck from the main inference path and uses a fully parallelized decoder plus GAN vocoder.",
      "position_in_field": "Strong unconstrained lip-to-speech systems paper centered on latency reduction rather than a new sensing modality.",
      "practical_value": "Useful when lip-to-speech quality matters but the older autoregressive waveform stack is too slow for interactive use.",
      "axes_moved": "system design; model efficiency; evaluation",
      "axes_unresolved": "live deployment; robustness to unconstrained capture artifacts; broader speaker coverage",
      "axes_regressed": "",
      "technical_limits": "Evidence is still limited to benchmark corpora with offline generation windows and three evaluated speakers per dataset slice.",
      "evaluation_limits": "The main quality evidence is subjective MOS plus GRID PESQ; there is no in-the-wild user study or end-to-end conversational latency test.",
      "deployment_limits": "The paper supports lower-latency generation, but not a live camera-to-audio interactive deployment.",
      "scope_limits": "Unconstrained lip-to-speech synthesis from face video only.",
      "task": "speech-reconstruction",
      "input_modality": "silent talking-face video",
      "sensor_hardware": "",
      "body_site": "face; lip",
      "output_type": "speech-audio",
      "vocabulary_type": "large-vocabulary unconstrained speech",
      "vocabulary_size": "Lip2Wav: 5000+ words; GRID: 51 words",
      "metrics": "MOS quality/intelligibility/naturalness; PESQ; waveform acceleration ratio; parameter count",
      "evaluation_mode": "subjective MOS on Lip2Wav and GRID, PESQ on GRID, plus mel and waveform inference-speed comparison",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 0.99,
          "statement": "The abstract presents FastLTS as an end-to-end non-autoregressive lip-to-speech model that directly generates waveform audio from unconstrained talking-face video with low latency.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_fastlts-non-autoregressive-end-to-end-unconstrained-lip-to-speech-synthesis-fastlts-non-autoregressive-end-to-end-unconstrained-lip-to-speech-synthesis.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_fastlts-non-autoregressive-end-to-end-unconstrained-lip-to-speech-synthesis-fastlts-non-autoregressive-end-to-end-unconstrained-lip-to-speech-synthesis.txt",
          "section_or_location": "ABSTRACT",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 0.98,
          "statement": "Section 5.1 states that experiments use the Lip2Wav corpus with 5 speakers but evaluate on 3 speakers, and also test adaptation on 3 speakers from the constrained GRID dataset.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_fastlts-non-autoregressive-end-to-end-unconstrained-lip-to-speech-synthesis-fastlts-non-autoregressive-end-to-end-unconstrained-lip-to-speech-synthesis.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_fastlts-non-autoregressive-end-to-end-unconstrained-lip-to-speech-synthesis-fastlts-non-autoregressive-end-to-end-unconstrained-lip-to-speech-synthesis.txt",
          "section_or_location": "5.1 Datasets",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.99,
          "statement": "On GRID, FastLTS reports MOS 3.59 for quality, 3.68 for intelligibility, and 3.73 for naturalness, versus 3.27, 3.47, and 3.54 for Lip2Wav.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_fastlts-non-autoregressive-end-to-end-unconstrained-lip-to-speech-synthesis-fastlts-non-autoregressive-end-to-end-unconstrained-lip-to-speech-synthesis.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_fastlts-non-autoregressive-end-to-end-unconstrained-lip-to-speech-synthesis-fastlts-non-autoregressive-end-to-end-unconstrained-lip-to-speech-synthesis.txt",
          "section_or_location": "Table 3: MOS on GRID Dataset",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.99,
          "statement": "Section 5.5 reports that waveform synthesis reaches a 19.76x acceleration ratio at 3-second input length.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_fastlts-non-autoregressive-end-to-end-unconstrained-lip-to-speech-synthesis-fastlts-non-autoregressive-end-to-end-unconstrained-lip-to-speech-synthesis.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_fastlts-non-autoregressive-end-to-end-unconstrained-lip-to-speech-synthesis-fastlts-non-autoregressive-end-to-end-unconstrained-lip-to-speech-synthesis.txt",
          "section_or_location": "5.5         Inference Speedup",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_improved-processing-of-ultrasound-tongue-videos-by-combining-convlstm-and-3d-convolutional-networks",
      "slug": "improved-processing-of-ultrasound-tongue-videos-by-combining-convlstm-and-3d-convolutional-networks",
      "title": "Improved Processing of Ultrasound Tongue Videos by Combining ConvLSTM and 3D Convolutional Networks",
      "year": 2022,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Amin Honarmandi Shandiz",
        "László Tóth"
      ],
      "url": "https://nao-ki-mura.com/paper/improved-processing-of-ultrasound-tongue-videos-by-combining-convlstm-and-3d-convolutional-networks",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2206.12947",
      "arxiv_url": "https://arxiv.org/abs/2206.12947",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:tongue",
        "modality:ultrasound",
        "task:speech-reconstruction",
        "output:speech-audio",
        "deployment:speaker-dependent",
        "evaluation:quantitative"
      ],
      "expert_take_short": "An empirically supported, incremental advancement showing that hybrid 3D-CNN plus ConvLSTM models modestly outperform prior ultrasound tongue video SSI architectures in mel-spectrogram regression accuracy and model efficiency on single-speaker data.",
      "expert_take_long": "This work contributes a systematic experimental study on neural network architectures for direct articulatory-to-acoustic regression in ultrasound tongue image-based silent speech interfaces. By leveraging a hybrid model that combines early 3D convolutional layers and a top ConvLSTM layer, the authors demonstrate improved mel-spectrogram reconstruction accuracy, model compactness, and faster training compared to earlier 3D-CNN and 3D-CNN+BiLSTM baselines. The dataset consists of a publicly available Hungarian single-speaker corpus, with synchronized ultrasound video at 82 fps and audio at 11 kHz. While the scope is limited to objective spectrogram error metrics without perceptual or cross-speaker validation, the work clarifies the advantages of fusing spatial and temporal information through ConvLSTM in this SSI application. Deployment challenges remain due to hardware and training data constraints, and generalizability is not addressed. Nonetheless, the paper provides valuable architectural insights and empirical validation for ultrasound SSI modeling choices, serving as a practical reference for future ultrasound-based silent speech reconstruction research.",
      "expert_true_value": "This study delivers concrete evidence that integrating ConvLSTM layers atop 3D-CNN feature extractors enhances articulatory-to-speech spectral regression accuracy and reduces network complexity, offering a technically sound and resource-efficient SSI architecture alternative to classical CNN plus sequential LSTM models in ultrasound tongue video speech reconstruction.",
      "canon_before": "Baseline ultrasound tongue SSI models typically combined 2D-CNN with LSTM layers or employed 3D-CNNs, with limited prior use of ConvLSTM architectures, which integrate convolution and temporal gating in one layer and thus preserve spatiotemporal structure more directly but were not widely applied in this domain before.",
      "delta_from_canon": "Replacing the uppermost dense or BiLSTM temporal integration layers of a 3D-CNN architecture with a ConvLSTM layer reduces the model depth and size and improves mel-spectrogram regression accuracy, producing a more compact and accurate network for SSI ultrasound speech reconstruction.",
      "position_in_field": "Focused on comparing temporal feature integration architectures within ultrasound tongue video SSI direct speech reconstruction pipelines",
      "practical_value": "Provides architecture design insights for researchers seeking tradeoffs between accuracy, network size, and training speed in ultrasound tongue SSI systems",
      "axes_moved": "system_design; evaluation",
      "axes_unresolved": "cross-speaker generalization; intelligibility evaluation; real-world deployment robustness",
      "axes_regressed": "",
      "technical_limits": "Limited dataset from one speaker; absence of perceptual acoustic evaluations; ConvLSTM layers have more parameters requiring filter size adjustments; computations performed offline without real-time deployment tests",
      "evaluation_limits": "Evaluation is limited to a single-speaker Hungarian dataset; the metrics are mean squared error and R2 score on mel-spectrogram regression, without perceptual intelligibility testing or cross-speaker/word generalization analysis.",
      "deployment_limits": "The approach requires access to specialized ultrasound tongue imaging hardware and synchronized audio for training. It was trained on single-speaker data only, limiting generalizability. Real-time deployment performance and robustness to varied environmental factors remain untested.",
      "scope_limits": "Single-speaker articulatory-to-acoustic regression using synchronized ultrasound tongue videos and audio mel-spectrograms; no perceptual, cross-speaker, or robustness evaluations included",
      "task": "Speech reconstruction from ultrasound tongue video sequences",
      "input_modality": "Ultrasound tongue video sequences (82 fps) synchronized with audio (11,025 Hz)",
      "sensor_hardware": "Ultrasound tongue imaging probe (Micro system by Articulate Instruments) with stabilizing headset; Audio-Technica ATR 3350 microphone",
      "body_site": "tongue",
      "output_type": "Speech audio spectra reconstructed via mel-spectrogram regression followed by WaveGlow vocoder synthesis",
      "vocabulary_type": "",
      "vocabulary_size": "",
      "metrics": "Mean squared error (MSE) on mel-spectrogram regression around 0.276; mean R2 regression score approximately 0.73; improvements are incremental but consistent compared to baselines",
      "evaluation_mode": "Objective regression using mean squared error (MSE) and R2 scores on train, development, and test splits for mel-spectrogram frame prediction from ultrasound tongue video input sequences.",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 1,
          "statement": "The authors claim that a hybrid model combining 3D-CNN and ConvLSTM layers performs better for direct ultrasound tongue video to speech mel-spectrogram mapping accuracy, while being faster and smaller than previous 3D-CNN baselines.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_improved-processing-of-ultrasound-tongue-videos-by-combining-convlstm-and-3d-convolutional-networks-improved-processing-of-ultrasound-tongue-videos-by-combining-convlstm-and-3d-con.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_improved-processing-of-ultrasound-tongue-videos-by-combining-convlstm-and-3d-convolutional-networks-improved-processing-of-ultrasound-tongue-videos-by-combining-convlstm-and-3d-con.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 1,
          "statement": "Replacing the uppermost 3D-CNN and BiLSTM layers with a ConvLSTM layer reduces model depth and size and improves mel-spectrogram regression accuracy, creating an efficient hybrid architecture for ultrasound tongue video SSI reconstruction.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_improved-processing-of-ultrasound-tongue-videos-by-combining-convlstm-and-3d-convolutional-networks-improved-processing-of-ultrasound-tongue-videos-by-combining-convlstm-and-3d-con.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_improved-processing-of-ultrasound-tongue-videos-by-combining-convlstm-and-3d-convolutional-networks-improved-processing-of-ultrasound-tongue-videos-by-combining-convlstm-and-3d-con.txt",
          "section_or_location": "4 Experimental Setup",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "The best hybrid 3D-CNN + ConvLSTM model achieved mean squared error around 0.276 and mean R2 score about 0.73 on the test set for mel-spectrogram regression, outperforming 3D-CNN alone and 3D-CNN + BiLSTM baselines.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_improved-processing-of-ultrasound-tongue-videos-by-combining-convlstm-and-3d-convolutional-networks-improved-processing-of-ultrasound-tongue-videos-by-combining-convlstm-and-3d-con.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_improved-processing-of-ultrasound-tongue-videos-by-combining-convlstm-and-3d-convolutional-networks-improved-processing-of-ultrasound-tongue-videos-by-combining-convlstm-and-3d-con.txt",
          "section_or_location": "5 Results",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 1,
          "statement": "Evaluations were performed only on a single female Hungarian speaker dataset of 438 sentences split into 310 training, 41 development, and 87 test samples; no cross-speaker, cross-language, or perceptual intelligibility testing was conducted.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_improved-processing-of-ultrasound-tongue-videos-by-combining-convlstm-and-3d-convolutional-networks-improved-processing-of-ultrasound-tongue-videos-by-combining-convlstm-and-3d-con.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_improved-processing-of-ultrasound-tongue-videos-by-combining-convlstm-and-3d-convolutional-networks-improved-processing-of-ultrasound-tongue-videos-by-combining-convlstm-and-3d-con.txt",
          "section_or_location": "3 Data acquisition",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 1,
          "statement": "The approach is limited by requiring specialized ultrasound hardware, single-speaker training data, lack of perceptual or multi-speaker evaluation, and untested real-time deployment in varied conditions.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_improved-processing-of-ultrasound-tongue-videos-by-combining-convlstm-and-3d-convolutional-networks-improved-processing-of-ultrasound-tongue-videos-by-combining-convlstm-and-3d-con.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_improved-processing-of-ultrasound-tongue-videos-by-combining-convlstm-and-3d-convolutional-networks-improved-processing-of-ultrasound-tongue-videos-by-combining-convlstm-and-3d-con.txt",
          "section_or_location": "3 Data acquisition",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "deployment_claim",
          "confidence": 0.9,
          "statement": "Although not experimentally proven, reducing the network depth via ConvLSTM increases training and inference speed, suggesting potential for faster, more compact models suitable for real-time usage after further validation.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_improved-processing-of-ultrasound-tongue-videos-by-combining-convlstm-and-3d-convolutional-networks-improved-processing-of-ultrasound-tongue-videos-by-combining-convlstm-and-3d-con.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_improved-processing-of-ultrasound-tongue-videos-by-combining-convlstm-and-3d-convolutional-networks-improved-processing-of-ultrasound-tongue-videos-by-combining-convlstm-and-3d-con.txt",
          "section_or_location": "5 Results",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_visagesyntalk-unseen-speaker-video-to-speech-synthesis-via-speech-visage-feature-selection",
      "slug": "visagesyntalk-unseen-speaker-video-to-speech-synthesis-via-speech-visage-feature-selection",
      "title": "VisageSynTalk: Unseen Speaker Video-to-Speech Synthesis via Speech-Visage Feature Selection",
      "year": 2022,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Joanna Hong",
        "Minsu Kim",
        "Yong Man Ro"
      ],
      "url": "https://nao-ki-mura.com/paper/visagesyntalk-unseen-speaker-video-to-speech-synthesis-via-speech-visage-feature-selection",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2206.07458",
      "arxiv_url": "https://arxiv.org/abs/2206.07458",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:face",
        "modality:video",
        "output:speech-audio",
        "task:speech-reconstruction",
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "deployment:speaker-independent"
      ],
      "expert_take_short": "The paper is really about disentangling identity, and that is why the unseen-speaker results hold up.",
      "expert_take_long": "The paper presents a novel disentanglement-based approach for video-to-speech synthesis that separates speech content and speaker identity from silent talking-face video using multi-head speech-visage feature selection masks, and synthesizes speech conditioned on these disentangled features. Evaluated on three datasets (GRID, TCD-TIMIT, LRW), the method achieves state-of-the-art objective metrics (STOI, ESTOI, PESQ) on unseen speakers with multi-speaker independent splits, as well as highest subjective MOS scores for naturalness, intelligibility, and voice matching. Ablations confirm the importance of the disentanglement and the multi-head mask design. Speaker verification analysis via EER demonstrates effective separation of identity and content features. Limitations include reliance on aligned, controlled videos, lack of real-time or in-the-wild evaluation, and relatively modest speech quality compared to natural voices. Nonetheless, the work meaningfully advances unseen-speaker video-to-speech robustness via representation learning rather than larger datasets or enrollment procedures.",
      "expert_true_value": "The paper matters because the content-style split is operational, not decorative: it beats prior video-to-speech systems on unseen-speaker GRID and TCD-TIMIT and scales to LRW with a principled feature selection and visage-style conditioned synthesizer architecture.",
      "canon_before": "Lip-to-speech systems improved waveform quality, but most treated face video as a single entangled signal and were weak on unseen speakers.",
      "delta_from_canon": "Makes identity disentanglement explicit through speech-visage feature selection and a visage-style conditioned synthesizer.",
      "position_in_field": "Strong video-to-speech synthesis paper focused on unseen-speaker generalization rather than core SSI hardware.",
      "practical_value": "Useful when face-video synthesis must generalize beyond training identities without a speaker-specific enrollment pipeline.",
      "axes_moved": "system design; evaluation; speaker generalization",
      "axes_unresolved": "in-the-wild robustness; real-time inference; noisy capture conditions",
      "axes_regressed": "",
      "technical_limits": "Even best scores fall short of natural speech; requires clean, aligned talking-face videos; no demonstrated robustness to occlusion or pose variation.",
      "evaluation_limits": "Offline benchmark evaluations only; MOS study limited to 20 GRID samples rated by 16 participants; no in-the-wild tests.",
      "deployment_limits": "No evaluation under camera noise, varying head poses, or real-time inference; limited to aligned, controlled talking-face video.",
      "scope_limits": "Talking-face speech synthesis for unseen speakers with benchmark datasets; no broader SSI application beyond video-to-speech synthesis is addressed.",
      "task": "speech-reconstruction",
      "input_modality": "video",
      "sensor_hardware": "",
      "body_site": "face",
      "output_type": "speech-audio",
      "vocabulary_type": "",
      "vocabulary_size": "",
      "metrics": "STOI, ESTOI, PESQ; MOS naturalness, intelligibility, and voice matching; Equal Error Rate (EER) for speaker verification of disentangled features.",
      "evaluation_mode": "Multi-speaker independent and dependent benchmark evaluation plus MOS for naturalness, intelligibility, and voice matching; disentanglement validated via speaker verification EER.",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 0.95,
          "statement": "Disentangles speech content and visage style from silent talking-face video to synthesize speech for unseen speakers.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_visagesyntalk-unseen-speaker-video-to-speech-synthesis-via-speech-visage-feature-selection-visagesyntalk-unseen-speaker-video-to-speech-synthesis-via-speech-visage-feature.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_visagesyntalk-unseen-speaker-video-to-speech-synthesis-via-speech-visage-feature-selection-visagesyntalk-unseen-speaker-video-to-speech-synthesis-via-speech-visage-feature.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 0.95,
          "statement": "The model explicitly separates speech-content features from identity-style features before synthesis using multi-head speech-visage feature selection masks.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_visagesyntalk-unseen-speaker-video-to-speech-synthesis-via-speech-visage-feature-selection-visagesyntalk-unseen-speaker-video-to-speech-synthesis-via-speech-visage-feature.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_visagesyntalk-unseen-speaker-video-to-speech-synthesis-via-speech-visage-feature-selection-visagesyntalk-unseen-speaker-video-to-speech-synthesis-via-speech-visage-feature.txt",
          "section_or_location": "3.1 Speech-visage feature selection",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.95,
          "statement": "On unseen-speaker GRID, the proposed model reaches STOI 0.567, ESTOI 0.308, and PESQ 1.373, improving on End-to-end GAN and VV-Memory. On unseen-speaker TCD-TIMIT it reaches 0.478, 0.217, and 1.410. MOS naturalness 2.96, intelligibility 3.35, voice matching 3.34 on GRID dataset with 20 samples rated by 16 participants.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_visagesyntalk-unseen-speaker-video-to-speech-synthesis-via-speech-visage-feature-selection-visagesyntalk-unseen-speaker-video-to-speech-synthesis-via-speech-visage-feature.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_visagesyntalk-unseen-speaker-video-to-speech-synthesis-via-speech-visage-feature-selection-visagesyntalk-unseen-speaker-video-to-speech-synthesis-via-speech-visage-feature.txt",
          "section_or_location": "4.3 Experimental results",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 0.9,
          "statement": "Evaluated on the GRID, TCD-TIMIT (volunteer subset), and LRW datasets with multi-speaker independent and dependent settings; unseen speaker splits are employed; LRW contains 17,580 clustered speaker identities.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_visagesyntalk-unseen-speaker-video-to-speech-synthesis-via-speech-visage-feature-selection-visagesyntalk-unseen-speaker-video-to-speech-synthesis-via-speech-visage-feature.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_visagesyntalk-unseen-speaker-video-to-speech-synthesis-via-speech-visage-feature-selection-visagesyntalk-unseen-speaker-video-to-speech-synthesis-via-speech-visage-feature.txt",
          "section_or_location": "4 Experiments",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 0.9,
          "statement": "Best achievable speech quality remains far from natural speech; method depends on high-quality, aligned and cropped talking-face video; no in-the-wild or real-time study; MOS ratings based on only 20 GRID samples rated by 16 participants.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_visagesyntalk-unseen-speaker-video-to-speech-synthesis-via-speech-visage-feature-selection-visagesyntalk-unseen-speaker-video-to-speech-synthesis-via-speech-visage-feature.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_visagesyntalk-unseen-speaker-video-to-speech-synthesis-via-speech-visage-feature-selection-visagesyntalk-unseen-speaker-video-to-speech-synthesis-via-speech-visage-feature.txt",
          "section_or_location": "4.3 Experimental results",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "deployment_claim",
          "confidence": 0.8,
          "statement": "No evaluation or discussion of deployment under realistic capture conditions such as camera noise, head pose drift, or occlusion; no latency or live interactive system reported.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_visagesyntalk-unseen-speaker-video-to-speech-synthesis-via-speech-visage-feature-selection-visagesyntalk-unseen-speaker-video-to-speech-synthesis-via-speech-visage-feature.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_visagesyntalk-unseen-speaker-video-to-speech-synthesis-via-speech-visage-feature-selection-visagesyntalk-unseen-speaker-video-to-speech-synthesis-via-speech-visage-feature.txt",
          "section_or_location": "4.3 Experimental results",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_silence-is-sweeter-than-speech-self-supervised-model-using-silence-to-store-speaker-information",
      "slug": "silence-is-sweeter-than-speech-self-supervised-model-using-silence-to-store-speaker-information",
      "title": "Silence is Sweeter Than Speech: Self-Supervised Model Using Silence to Store Speaker Information",
      "year": 2022,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Chi-Luen Feng",
        "Po-chun Hsu",
        "Hung-yi Lee"
      ],
      "url": "https://nao-ki-mura.com/paper/silence-is-sweeter-than-speech-self-supervised-model-using-silence-to-store-speaker-information",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2205.03759",
      "arxiv_url": "https://arxiv.org/abs/2205.03759",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "modality:acoustic",
        "output:labels",
        "evaluation:quantitative"
      ],
      "expert_take_short": "Strong evidence that silence segments in HuBERT representations uniquely store speaker information, improving SID accuracy when silence is augmented; analytical SSL probing paper outside silent speech interface field.",
      "expert_take_long": "This paper provides mechanistic insight into how SSL speech models, specifically HuBERT and related models, store speaker information preferentially in silence segments within utterances. The authors use a position-aware probing framework by segmenting utterance representations into fragments and learning per-fragment weights for speaker identification downstream tasks. They find a strong correlation between silence ratio and SID accuracy and that silence fragments consistently yield the highest speaker ID performance. Adding silence segments increases SID accuracy by up to about 2% for HuBERT without any fine-tuning of the upstream SSL model. While the insights are novel and well-supported, the work is limited to speaker information on VoxCeleb-like data and does not advance silent speech interface technology or speech reconstruction. Thus, it stands as an insightful analysis paper for SSL speech representation learning but lies outside the core SSI application scope.",
      "expert_true_value": "Demonstrates for the first time that self-supervised speech models localize speaker info in silence fragments, offering new perspective on representation structure and potential augmentation methods for speaker tasks.",
      "canon_before": "SSL speech analysis mostly compared layers or overall models rather than analyzing positional embedding within utterances.",
      "delta_from_canon": "Moves analysis to within-utterance positional granularity and identifies silence fragments as key speaker information carriers.",
      "position_in_field": "Speech SSL analysis outside core silent speech interface research.",
      "practical_value": "Insights can inform speaker representation analysis and data augmentation in SID/ASV pipelines but do not enable silent speech communication.",
      "axes_moved": "evaluation; problem_reframing",
      "axes_unresolved": "Whether other types of speech information localize similarly; generalization beyond tested SSL models and datasets.",
      "axes_regressed": "",
      "technical_limits": "Findings established for specific SSL models and SID probing setups; unknown if causal or generalizable across architectures or languages.",
      "evaluation_limits": "Evaluations limited to HuBERT-family and wav2vec2 SSL models on VoxCeleb data; effect sizes modest especially for stronger models; causal mechanisms not proven beyond correlation.",
      "deployment_limits": "No direct deployment pathway for silent speech interfaces; work is analytical and probing-focused with no real-time or wearable system demonstrated.",
      "scope_limits": "Analyzes only speaker information; does not address other speech contents or silent speech interface modalities.",
      "task": "speaker identification analysis",
      "input_modality": "acoustic (speech audio)",
      "sensor_hardware": "microphone",
      "body_site": "",
      "output_type": "labels",
      "vocabulary_type": "",
      "vocabulary_size": "",
      "metrics": "SID accuracy changes quantified, e.g., HuBERT baseline SID accuracy 0.807 increased to 0.824 by adding silence equal to 1/10 of the waveform length at the front of the waveform, an absolute gain of about 1.7 percentage points (approximately 2%); silence ratio below 5% reduces SID accuracy by 30-50%.",
      "evaluation_mode": "Probing with fixed upstream SSL models and simple linear downstream speaker identification models.",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 0.9,
          "statement": "Speaker information in HuBERT concentrates around silence, and adding silence can improve SID without finetuning the upstream model.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_silence-is-sweeter-than-speech-self-supervised-model-using-silence-to-store-speaker-information-silence-is-sweeter-than-speech-self-supervised-model-using-silence-to-store-spea.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_silence-is-sweeter-than-speech-self-supervised-model-using-silence-to-store-speaker-information-silence-is-sweeter-than-speech-self-supervised-model-using-silence-to-store-spea.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "fact",
          "confidence": 0.95,
          "statement": "By segmenting utterances into 10 fragments and learning weights per fragment, the last fragment (with the highest silence ratio) contributes most to the SID task.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_silence-is-sweeter-than-speech-self-supervised-model-using-silence-to-store-speaker-information-silence-is-sweeter-than-speech-self-supervised-model-using-silence-to-store-spea.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_silence-is-sweeter-than-speech-self-supervised-model-using-silence-to-store-speaker-information-silence-is-sweeter-than-speech-self-supervised-model-using-silence-to-store-spea.txt",
          "section_or_location": "3. WHERE IS THE SPEAKER INFORMATION",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 0.9,
          "statement": "Silence fragments consistently yield higher SID and ASV accuracy across HuBERT-Base, HuBERT-1Iter, and HuBERT-Large models regardless of silence insertion position in the waveform (front, middle, end).",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_silence-is-sweeter-than-speech-self-supervised-model-using-silence-to-store-speaker-information-silence-is-sweeter-than-speech-self-supervised-model-using-silence-to-store-spea.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_silence-is-sweeter-than-speech-self-supervised-model-using-silence-to-store-speaker-information-silence-is-sweeter-than-speech-self-supervised-model-using-silence-to-store-spea.txt",
          "section_or_location": "Base",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.9,
          "statement": "Adding silence equal to 1/10 of the original waveform length increases HuBERT SID accuracy from 0.807 to 0.824 and HuBERT-Large from 0.890 to 0.892 on VoxCeleb test data, measured without finetuning upstream.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_silence-is-sweeter-than-speech-self-supervised-model-using-silence-to-store-speaker-information-silence-is-sweeter-than-speech-self-supervised-model-using-silence-to-store-spea.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_silence-is-sweeter-than-speech-self-supervised-model-using-silence-to-store-speaker-information-silence-is-sweeter-than-speech-self-supervised-model-using-silence-to-store-spea.txt",
          "section_or_location": "4.3. Do Silence Really Important for SSL Models? Yes",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 0.85,
          "statement": "Findings are limited to specific SSL speech models (HuBERT variants and wav2vec2) and speaker ID probing on VoxCeleb; effect sizes for larger models are modest; causal mechanisms not confirmed.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_silence-is-sweeter-than-speech-self-supervised-model-using-silence-to-store-speaker-information-silence-is-sweeter-than-speech-self-supervised-model-using-silence-to-store-spea.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_silence-is-sweeter-than-speech-self-supervised-model-using-silence-to-store-speaker-information-silence-is-sweeter-than-speech-self-supervised-model-using-silence-to-store-spea.txt",
          "section_or_location": "6. CONCLUSION",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_svts-scalable-video-to-speech-synthesis",
      "slug": "svts-scalable-video-to-speech-synthesis",
      "title": "SVTS: Scalable Video-to-Speech Synthesis",
      "year": 2022,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Rodrigo Mira",
        "Alexandros Haliassos",
        "Stavros Petridis",
        "Björn W. Schuller",
        "Maja Pantic"
      ],
      "url": "https://nao-ki-mura.com/paper/svts-scalable-video-to-speech-synthesis",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2205.02058",
      "arxiv_url": "https://arxiv.org/abs/2205.02058",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:lip",
        "modality:video",
        "task:speech-reconstruction",
        "output:speech-audio",
        "deployment:speaker-independent",
        "evaluation:quantitative",
        "evaluation:unseen-words"
      ],
      "expert_take_short": "A key scaling contribution that demonstrates simple spectrogram prediction plus pretrained vocoder pipelines outperform prior complex models on diverse datasets, marking foundational progress in large-scale video-to-speech synthesis.",
      "expert_take_long": "This work provides a pragmatically designed and thoroughly evaluated video-to-speech system prioritizing scalability. It employs a clean architectural split—video-to-spectrogram predictor using ResNet18 plus conformers, paired with a pretrained Parallel WaveGAN vocoder—favoring ease of training on large unlabeled audiovisual corpora. The method beats prior art on popular benchmarks like GRID and LRW and achieves intelligible speech on challenging datasets like LRS3, with further improvements shown when scaling training data by incorporating VoxCeleb2. While it does not solve deployment challenges in-the-wild, it marks an important shift in the field toward prioritizing data scale and simplicity rather than complex loss engineering or small-data-specific approaches, providing a strong baseline for large-scale visual silent speech research.",
      "expert_true_value": "Establishes a practical, scalable video-to-speech baseline capable of leveraging large, unconstrained datasets (notably LRS3+VoxCeleb2) with competitive intelligibility, shifting the field from intricate small-dataset loss engineering to data-driven scaling.",
      "canon_before": "Previous video-to-speech methods typically relied on small, constrained datasets, complex loss functions, and architectures that scaled poorly to large, diverse datasets.",
      "delta_from_canon": "Shifts towards simpler, scalable architectures using a two-stage spectrogram prediction plus pretrained vocoder pipeline; demonstrates results on large, less constrained datasets such as LRW and especially LRS3.",
      "position_in_field": "Foundational large-scale video-to-speech paper, demonstrating scalability and strong baseline for lip-based silent speech reconstruction relevant to SSI modalities.",
      "practical_value": "Provides a reliable, scalable video-to-speech system useful as a baseline for future research and a blueprint emphasizing data scale over complex losses; useful especially for those seeking to develop large-scale silent speech interfaces.",
      "axes_moved": "system_design; deployment; evaluation",
      "axes_unresolved": "In-the-wild robustness; higher fidelity on unconstrained data",
      "axes_regressed": "",
      "technical_limits": "Dependent on large curated audiovisual datasets; lower fidelity and intelligibility on unseen speakers and unconstrained conditions; no demonstration of robustness to in-the-wild variation; ASR evaluation limitations on complex datasets.",
      "evaluation_limits": "No user studies or open-vocabulary ASR-based WER for LRS3 due to ASR unreliability on generated audio; evaluation focused on seen- and unseen-speaker splits on standard datasets.",
      "deployment_limits": "Requires reasonably cropped, aligned lip video and substantial, curated audiovisual training data; no unconstrained deployment validation yet.",
      "scope_limits": "Limited to lip-video-to-speech reconstruction; does not address cross-modal or other silent speech modalities.",
      "task": "speech-reconstruction",
      "input_modality": "video",
      "sensor_hardware": "camera",
      "body_site": "lip",
      "output_type": "speech-audio",
      "vocabulary_type": "word- and sentence-level audiovisual speech",
      "vocabulary_size": "GRID (51 words), LRW (500 words), LRS3 (>50,000 words), LRS3+VoxCeleb2 (>1,500 hours of data)",
      "metrics": "PESQ, STOI, and ESTOI on generated speech, plus WER computed with pretrained ASR models; GRID and LRW WER reported, but no WER for LRS3 due to ASR unreliability; vocoder speed measured in clips/second; loss function ablations measured by the same metrics.",
      "evaluation_mode": "Quantitative benchmark evaluations and ablation studies including vocoder and loss comparisons.",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 1,
          "statement": "We introduce a scalable video-to-speech framework consisting of two components: a video-to-spectrogram predictor and a pre-trained neural vocoder, which converts the mel-frequency spectrograms into waveform audio, achieving state-of-the-art results on GRID and LRW, and intelligible results on LRS3.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_svts-scalable-video-to-speech-synthesis-svts-scalable-video-to-speech-synthesis.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_svts-scalable-video-to-speech-synthesis-svts-scalable-video-to-speech-synthesis.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 1,
          "statement": "Our model uses a simpler, scalable predictor/vocoder split, with a ResNet18+conformer video-to-spectrogram predictor trained on scaled training data, combined with a pretrained Parallel WaveGAN vocoder; this enables efficient and scalable training unlike more complex architectures or bespoke loss engineering of prior work.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_svts-scalable-video-to-speech-synthesis-svts-scalable-video-to-speech-synthesis.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_svts-scalable-video-to-speech-synthesis-svts-scalable-video-to-speech-synthesis.txt",
          "section_or_location": "2. Methodology",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 1,
          "statement": "We evaluate on the small GRID dataset (27 hours), medium-scale LRW (around 150 hours), the challenging large-scale LRS3 corpus (312 hours), and a combined LRS3+VoxCeleb2 corpus exceeding 1,500 hours, with seen and unseen speaker splits where applicable.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_svts-scalable-video-to-speech-synthesis-svts-scalable-video-to-speech-synthesis.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_svts-scalable-video-to-speech-synthesis-svts-scalable-video-to-speech-synthesis.txt",
          "section_or_location": "3. Experimental setup",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "Quantitative evaluation metrics include PESQ, STOI, ESTOI for speech quality and intelligibility assessment, as well as WER measured using pretrained ASR models on generated audio for GRID and LRW; WER on LRS3 was not reported due to ASR unreliability on generated samples.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_svts-scalable-video-to-speech-synthesis-svts-scalable-video-to-speech-synthesis.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_svts-scalable-video-to-speech-synthesis-svts-scalable-video-to-speech-synthesis.txt",
          "section_or_location": "3.4. Evaluation metrics",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "Parallel WaveGAN neural vocoder outperforms Griffin-Lim and other neural vocoders on all evaluation metrics and runs significantly faster (around 54.7 clips/sec on GRID with Nvidia RTX 2080 Ti) enabling efficient waveform synthesis during inference.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_svts-scalable-video-to-speech-synthesis-svts-scalable-video-to-speech-synthesis.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_svts-scalable-video-to-speech-synthesis-svts-scalable-video-to-speech-synthesis.txt",
          "section_or_location": "4.2. Ablations",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "On GRID seen speakers, SVTS achieves PESQ 1.97, STOI 0.705, ESTOI 0.523, and WER 2.36%; on GRID unseen speakers, PESQ 1.40, STOI 0.588, ESTOI 0.318, and WER 17.85%; on LRW unseen speakers, PESQ 1.49, STOI 0.649, ESTOI 0.483, and WER 13.4%. LRS3 tasks yield intelligible results without reported WER.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_svts-scalable-video-to-speech-synthesis-svts-scalable-video-to-speech-synthesis.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_svts-scalable-video-to-speech-synthesis-svts-scalable-video-to-speech-synthesis.txt",
          "section_or_location": "4.1. Experiments",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 1,
          "statement": "The approach depends on large, curated audiovisual datasets and trained models do not yet achieve high fidelity or robustness on unconstrained unseen data, with ASR evaluation unreliable on LRS3 generated speech and no user studies reported.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_svts-scalable-video-to-speech-synthesis-svts-scalable-video-to-speech-synthesis.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_svts-scalable-video-to-speech-synthesis-svts-scalable-video-to-speech-synthesis.txt",
          "section_or_location": "4. Results",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "deployment_claim",
          "confidence": 1,
          "statement": "Deployment requires cropped and aligned lip video input and substantial training data; real-time usage is plausible given vocoder speed but not explicitly validated; end-user deployment requires further robustness and validation.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_svts-scalable-video-to-speech-synthesis-svts-scalable-video-to-speech-synthesis.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_svts-scalable-video-to-speech-synthesis-svts-scalable-video-to-speech-synthesis.txt",
          "section_or_location": "2. Methodology",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_listen-only-to-me-how-well-can-target-speech-extraction-handle-false-alarms",
      "slug": "listen-only-to-me-how-well-can-target-speech-extraction-handle-false-alarms",
      "title": "Listen only to me! How well can target speech extraction handle false alarms?",
      "year": 2022,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Marc Delcroix",
        "Keisuke Kinoshita",
        "Tsubasa Ochiai",
        "Katerina Zmolikova",
        "Hiroshi Sato",
        "Tomohiro Nakatani"
      ],
      "url": "https://nao-ki-mura.com/paper/listen-only-to-me-how-well-can-target-speech-extraction-handle-false-alarms",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2204.04811",
      "arxiv_url": "https://arxiv.org/abs/2204.04811",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "modality:acoustic",
        "output:speech-audio",
        "task:speech-enhancement"
      ],
      "expert_take_short": "Strong paper for false-alarm handling in TSE, wrong domain if someone tries to count it as SSI progress.",
      "expert_take_long": "Table 2 shows the tradeoff clearly. TSE-IS can directly output zeros, but at 10-second enrollment it drops to 10.8 dB SDRi before detection, 8.6% failure, 11.6% EER, and 13.4% fail-and-miss. TSE-V(360) is the better operating point: 13.6 dB SDRi before detection, 1.7% failure, 6.3% EER, and 7.1% fail-and-miss. Figure 4 sharpens the practical lesson: longer enrollment helps TSE-V approach roughly 5% EER around 15 to 20 seconds. That is a credible deployment result for TSE, but it has no silent-speech sensing or articulatory interface component.",
      "expert_true_value": "As a TSE paper, the full text is strong: the verification route keeps extraction quality closer to baseline while materially reducing false alarms, but it is still not an SSI paper.",
      "canon_before": "Target speech extraction papers usually assumed the enrolled speaker was always active, which hides false alarms at deployment time.",
      "delta_from_canon": "This paper makes inactive-speaker failure a first-class evaluation target and shows that verification-based handling is stronger than direct zero-output training.",
      "position_in_field": "Useful deployment-failure analysis for target speech extraction, outside the core SSI modality set.",
      "practical_value": "Relevant to voice-isolation systems that must know when not to output anything.",
      "axes_moved": "evaluation; deployment failure analysis",
      "axes_unresolved": "generalization beyond LibriMix; SSI relevance",
      "axes_regressed": "",
      "technical_limits": "Everything is benchmarked on LibriMix mixtures; there is no silent-speech modality or multimodal cue.",
      "evaluation_limits": "The evidence is specific to the SpeakerBeam family and the LibriMix training recipe reported here.",
      "deployment_limits": "TSE-V needs an extra verification stage and enrollment audio, and none of it addresses SSI interaction.",
      "scope_limits": "Target speech extraction with inactive-speaker handling only.",
      "task": "speech-enhancement",
      "input_modality": "speech mixture plus enrollment speech",
      "sensor_hardware": "",
      "body_site": "",
      "output_type": "speech-audio",
      "vocabulary_type": "",
      "vocabulary_size": "",
      "metrics": "SDRi before/after detection; fail rate; EER; fail-and-miss rate; attenuation; enrollment-duration curves",
      "evaluation_mode": "LibriMix extraction and active/inactive detection study using SDRi, fail rate, EER, and enrollment-duration sweeps",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 0.99,
          "statement": "The abstract says the paper studies how target speech extraction should handle inactive-speaker cases to avoid false alarms when the enrolled speaker is silent.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_listen-only-to-me-how-well-can-target-speech-extraction-handle-false-alarms-listen-only-to-me-how-well-can-target-speech-extraction-handle-false-alarms.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_listen-only-to-me-how-well-can-target-speech-extraction-handle-false-alarms-listen-only-to-me-how-well-can-target-speech-extraction-handle-false-alarms.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 0.98,
          "statement": "Table 1 describes the LibriMix setup with 13,900 Train-100k mixtures, 50,800 Train-360k mixtures, and 3,000-sample validation and test sets.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_listen-only-to-me-how-well-can-target-speech-extraction-handle-false-alarms-listen-only-to-me-how-well-can-target-speech-extraction-handle-false-alarms.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_listen-only-to-me-how-well-can-target-speech-extraction-handle-false-alarms-listen-only-to-me-how-well-can-target-speech-extraction-handle-false-alarms.txt",
          "section_or_location": "Table 1: Description of the dataset",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.99,
          "statement": "At 10-second enrollment, Table 2 reports TSE-V(360) at 13.6 dB SDRi before detection, 1.7% fail rate, 6.3% EER, and 7.1% fail-and-miss, outperforming TSE-IS at the same operating point.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_listen-only-to-me-how-well-can-target-speech-extraction-handle-false-alarms-listen-only-to-me-how-well-can-target-speech-extraction-handle-false-alarms.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_listen-only-to-me-how-well-can-target-speech-extraction-handle-false-alarms-listen-only-to-me-how-well-can-target-speech-extraction-handle-false-alarms.txt",
          "section_or_location": "Table 2: Extraction and detection performance with enrollment of average duration of 10 sec.",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 0.97,
          "statement": "Figure 4 and the accompanying discussion say TSE-V still needs 15 to 20 seconds of enrollment to approach about 5% EER and requires an additional verification step.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_listen-only-to-me-how-well-can-target-speech-extraction-handle-false-alarms-listen-only-to-me-how-well-can-target-speech-extraction-handle-false-alarms.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_listen-only-to-me-how-well-can-target-speech-extraction-handle-false-alarms-listen-only-to-me-how-well-can-target-speech-extraction-handle-false-alarms.txt",
          "section_or_location": "Figure 4: Extraction and AS/IS detection performance as a function of the enrollment duration.",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_multi-modality-associative-bridging-through-memory-speech-sound-recollected-from-face-video",
      "slug": "multi-modality-associative-bridging-through-memory-speech-sound-recollected-from-face-video",
      "title": "Multi-modality Associative Bridging through Memory: Speech Sound Recollected from Face Video",
      "year": 2022,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Minsu Kim",
        "Joanna Hong",
        "Se Jin Park",
        "Yong Man Ro"
      ],
      "url": "https://nao-ki-mura.com/paper/multi-modality-associative-bridging-through-memory-speech-sound-recollected-from-face-video",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2204.01265",
      "arxiv_url": "https://arxiv.org/abs/2204.01265",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:face",
        "body_site:lip",
        "deployment:hands-free",
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "modality:multimodal",
        "modality:video",
        "output:speech-audio",
        "task:speech-reconstruction"
      ],
      "expert_take_short": "The key idea is not generic fusion; it is storing cross-modal correspondences so video-only decoding can recover some audio-side structure later.",
      "expert_take_long": "Table 1 is the cleanest recognition evidence: the proposed method reaches 85.4 on LRW and 50.82 on LRW-1000, which is especially meaningful because the LRW-1000 gap over the next best method is large. The reconstruction side is smaller but still real: Table 2 reports 0.738 STOI, 0.579 ESTOI, and 1.984 PESQ on speaker-dependent GRID, edging Lip2Wav and Yadav et al. The human study in Table 3 reinforces that the gains are audible, with 2.93 naturalness and 3.56 intelligibility before WaveNet, and 4.37/4.27 with the WaveNet vocoder. The catch is scope: the reconstruction result is speaker-dependent GRID, not broad in-the-wild SSI.",
      "expert_true_value": "The full text supports that the memory bridge is doing real work on benchmarks: gains are modest on LRW but large on LRW-1000 and enough to edge prior methods on GRID reconstruction.",
      "canon_before": "Audio-visual fusion and common-representation methods usually needed both modalities at inference or failed when one modality was missing.",
      "delta_from_canon": "This paper claims that memory can preserve cross-modal associations so a visual-only downstream model can still borrow audio structure.",
      "position_in_field": "Interesting cross-modal architectural paper adjacent to lip reading and lip-to-speech reconstruction.",
      "practical_value": "Useful when paired audio-video data exists at training time and the goal is to improve visual-only inference with recalled audio structure.",
      "axes_moved": "cross-modal system design; benchmark quality",
      "axes_unresolved": "speaker-independent reconstruction; in-the-wild deployment; online use",
      "axes_regressed": "",
      "technical_limits": "The speech-reconstruction claim remains on speaker-dependent GRID, and the best subjective scores require an added WaveNet vocoder.",
      "evaluation_limits": "No live deployment, no uncontrolled capture conditions, and no speaker-independent reconstruction result are shown.",
      "deployment_limits": "The method needs paired audio-video supervision and benchmark-like preprocessing.",
      "scope_limits": "Benchmark lip reading and speech reconstruction from face video.",
      "task": "speech-reconstruction",
      "input_modality": "silent face video with recalled audio memory during training",
      "sensor_hardware": "",
      "body_site": "face; lip",
      "output_type": "speech-audio",
      "vocabulary_type": "word-level lip reading plus fixed-phrase speech reconstruction",
      "vocabulary_size": "LRW/LRW-1000 word vocabularies and GRID phrase set",
      "metrics": "word accuracy; STOI; ESTOI; PESQ; human naturalness and intelligibility ratings",
      "evaluation_mode": "word-level lip-reading benchmarks on LRW/LRW-1000 plus speaker-dependent GRID speech reconstruction with objective and human evaluation",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 0.99,
          "statement": "The abstract says the paper introduces an audio-visual multimodal bridging framework that can exploit both modalities even when only unimodal input is available at inference.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_multi-modality-associative-bridging-through-memory-speech-sound-recollected-from-face-video-multi-modality-associative-bridging-through-memory-speech-sound-recollected-from.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_multi-modality-associative-bridging-through-memory-speech-sound-recollected-from-face-video-multi-modality-associative-bridging-through-memory-speech-sound-recollected-from.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.99,
          "statement": "Table 1 reports the proposed method at 85.4 word accuracy on LRW and 50.82 on LRW-1000, ahead of the listed prior methods on both datasets.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_multi-modality-associative-bridging-through-memory-speech-sound-recollected-from-face-video-multi-modality-associative-bridging-through-memory-speech-sound-recollected-from.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_multi-modality-associative-bridging-through-memory-speech-sound-recollected-from-face-video-multi-modality-associative-bridging-through-memory-speech-sound-recollected-from.txt",
          "section_or_location": "Table 1. Lip reading word accuracy comparison with visual modal inputs on LRW and LRW-1000 dataset.",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 0.98,
          "statement": "Section 4.2.1 says the speech-reconstruction experiments use speaker-dependent GRID with subjects 1, 2, 4, and 29 following prior work.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_multi-modality-associative-bridging-through-memory-speech-sound-recollected-from-face-video-multi-modality-associative-bridging-through-memory-speech-sound-recollected-from.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_multi-modality-associative-bridging-through-memory-speech-sound-recollected-from-face-video-multi-modality-associative-bridging-through-memory-speech-sound-recollected-from.txt",
          "section_or_location": "4.2.1 Dataset",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.99,
          "statement": "Table 2 reports speaker-dependent GRID reconstruction scores of 0.738 STOI, 0.579 ESTOI, and 1.984 PESQ for the proposed method.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_multi-modality-associative-bridging-through-memory-speech-sound-recollected-from-face-video-multi-modality-associative-bridging-through-memory-speech-sound-recollected-from.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_multi-modality-associative-bridging-through-memory-speech-sound-recollected-from-face-video-multi-modality-associative-bridging-through-memory-speech-sound-recollected-from.txt",
          "section_or_location": "Table 2. Performance of speech reconstruction comparison with visual modal inputs in a speaker-dependent setting on GRID.",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_vcvts-multi-speaker-video-to-speech-synthesis-via-cross-modal-knowledge-transfer-from-voice-conversion",
      "slug": "vcvts-multi-speaker-video-to-speech-synthesis-via-cross-modal-knowledge-transfer-from-voice-conversion",
      "title": "VCVTS: Multi-speaker Video-to-Speech synthesis via cross-modal knowledge transfer from voice conversion",
      "year": 2022,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Disong Wang",
        "Shan Yang",
        "Dan Su",
        "Xunying Liu",
        "Dong Yu",
        "Helen Meng"
      ],
      "url": "https://nao-ki-mura.com/paper/vcvts-multi-speaker-video-to-speech-synthesis-via-cross-modal-knowledge-transfer-from-voice-conversion",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2202.09081",
      "arxiv_url": "https://arxiv.org/abs/2202.09081",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:lip",
        "deployment:hands-free",
        "deployment:speaker-independent",
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "modality:video",
        "output:speech-audio",
        "task:speech-reconstruction"
      ],
      "expert_take_short": "The real move is importing structure from voice conversion, not just adding another speaker embedding.",
      "expert_take_long": "Table 1 is the anchor. On GRID seen speakers, VCVTS with GL reaches 1.816 PESQ, 0.691 STOI, 0.512 ESTOI, and 4.38 MOS speaker similarity, improving over XTS and Lip2Wav. On unseen GRID speakers, the same model reaches 1.417/0.582/0.330 with 3.25 MOS naturalness and 2.66 MOS speaker similarity, again ahead of the baselines listed there. LRW is harder, but the model still reaches 1.352 PESQ, 0.628 STOI, and 0.458 ESTOI with 3.68 MOS speaker similarity. The paper is strongest where the architecture is most explicit: Section 2.3 shows that the VTS system is literally composed from VC speaker and pitch modules plus a Lip2Ind front-end.",
      "expert_true_value": "The full text supports that the VC transfer route improves both seen and unseen speaker VTS, especially on GRID, and gives a more controlled multi-speaker pipeline than prior lip-to-speech baselines.",
      "canon_before": "Multi-speaker VTS was usually a black-box lip-to-speech mapping with weak intermediate structure and brittle speaker control.",
      "delta_from_canon": "VCVTS borrows interpretable discrete content units, a speaker encoder, and pitch control from voice conversion instead of learning VTS from scratch.",
      "position_in_field": "Strong multi-speaker video-to-speech systems paper with a clear cross-modal transfer story.",
      "practical_value": "Useful when VTS must preserve speaker identity and intelligibility across multiple speakers instead of retraining separate systems.",
      "axes_moved": "system design; speaker control; evaluation",
      "axes_unresolved": "reference-free control; more robust unseen-speaker generalization; live deployment",
      "axes_regressed": "",
      "technical_limits": "The seen/unseen gap remains large, and waveform quality still depends on the vocoder choice.",
      "evaluation_limits": "All evidence is offline benchmark evidence on GRID and LRW with reference speech available for speaker control.",
      "deployment_limits": "The system is complex, reference-speech-dependent, and not validated in an interactive SSI setting.",
      "scope_limits": "Multi-speaker VTS from lip video only.",
      "task": "speech-reconstruction",
      "input_modality": "silent lip video plus reference speech for speaker control",
      "sensor_hardware": "",
      "body_site": "lip",
      "output_type": "speech-audio",
      "vocabulary_type": "constrained and open-vocabulary video-to-speech",
      "vocabulary_size": "GRID: 52 words; LRW: 500+ words",
      "metrics": "PESQ; STOI; ESTOI; MCD; F0-RMSE; MOS speech naturalness; MOS speaker similarity",
      "evaluation_mode": "objective and subjective VTS comparison on GRID and LRW for seen and unseen speakers",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 0.99,
          "statement": "The abstract presents VCVTS as a multi-speaker video-to-speech system built on cross-modal knowledge transfer from voice conversion.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_vcvts-multi-speaker-video-to-speech-synthesis-via-cross-modal-knowledge-transfer-from-voice-conversion-vcvts-multi-speaker-video-to-speech-synthesis-via-cross-modal-knowledge-transfer.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_vcvts-multi-speaker-video-to-speech-synthesis-via-cross-modal-knowledge-transfer-from-voice-conversion-vcvts-multi-speaker-video-to-speech-synthesis-via-cross-modal-knowledge-transfer.txt",
          "section_or_location": "ABSTRACT",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 0.98,
          "statement": "Section 2.3 says the proposed system concatenates a Lip2Ind network with the speaker encoder, pitch predictor, and decoder of a voice-conversion system to form multi-speaker VTS.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_vcvts-multi-speaker-video-to-speech-synthesis-via-cross-modal-knowledge-transfer-from-voice-conversion-vcvts-multi-speaker-video-to-speech-synthesis-via-cross-modal-knowledge-transfer.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_vcvts-multi-speaker-video-to-speech-synthesis-via-cross-modal-knowledge-transfer-from-voice-conversion-vcvts-multi-speaker-video-to-speech-synthesis-via-cross-modal-knowledge-transfer.txt",
          "section_or_location": "2.3. Multi-speaker VTS system",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 0.98,
          "statement": "Section 3 says experiments use GRID with seen and unseen speaker partitions and LRW with default splits for unconstrained evaluation.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_vcvts-multi-speaker-video-to-speech-synthesis-via-cross-modal-knowledge-transfer-from-voice-conversion-vcvts-multi-speaker-video-to-speech-synthesis-via-cross-modal-knowledge-transfer.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_vcvts-multi-speaker-video-to-speech-synthesis-via-cross-modal-knowledge-transfer-from-voice-conversion-vcvts-multi-speaker-video-to-speech-synthesis-via-cross-modal-knowledge-transfer.txt",
          "section_or_location": "3. EXPERIMENTS",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.99,
          "statement": "Table 1 reports VCVTS on unseen GRID speakers at 1.417 PESQ, 0.582 STOI, 0.330 ESTOI, 3.25 MOS speech naturalness, and 2.66 MOS speaker similarity with Griffin-Lim.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_vcvts-multi-speaker-video-to-speech-synthesis-via-cross-modal-knowledge-transfer-from-voice-conversion-vcvts-multi-speaker-video-to-speech-synthesis-via-cross-modal-knowledge-transfer.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_vcvts-multi-speaker-video-to-speech-synthesis-via-cross-modal-knowledge-transfer-from-voice-conversion-vcvts-multi-speaker-video-to-speech-synthesis-via-cross-modal-knowledge-transfer.txt",
          "section_or_location": "Table 1. Objective and subjective evaluation results of different VTS systems on testing speakers, where ‘Seen’ and ‘Unseen’ denote that",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_supervised-and-self-supervised-pretraining-based-covid-19-detection-using-acoustic-breathing-cough-speech-signals",
      "slug": "supervised-and-self-supervised-pretraining-based-covid-19-detection-using-acoustic-breathing-cough-speech-signals",
      "title": "Supervised and Self-supervised Pretraining Based COVID-19 Detection Using Acoustic Breathing/Cough/Speech Signals",
      "year": 2022,
      "venue": "ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), 2022, pp. 561-565",
      "authors": [
        "Xing-Yu Chen",
        "Qiu-Shi Zhu",
        "Jie Zhang",
        "Li-Rong Dai"
      ],
      "url": "https://nao-ki-mura.com/paper/supervised-and-self-supervised-pretraining-based-covid-19-detection-using-acoustic-breathing-cough-speech-signals",
      "doi": "10.1109/ICASSP43922.2022.9746205",
      "doi_url": "https://doi.org/10.1109/ICASSP43922.2022.9746205",
      "arxiv_id": "2201.08934",
      "arxiv_url": "https://arxiv.org/abs/2201.08934",
      "review_state": "expert_fulltext_draft",
      "review_priority": "low",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "modality:acoustic",
        "output:labels",
        "task:audio-classification"
      ],
      "expert_take_short": "Sound classification paper, not SSI.",
      "expert_take_long": "The paper is honest about what it is. Section 3.1 ties the experiments to the DiCOVA 2022 challenge data, and Table 1 shows the supervised and self-supervised pretraining variants improving over the listed baselines on Track-1 breathing, with the ensemble reaching 86.72 test AUC and 80.05 validation AUC. The abstract then gives the stronger fusion-track headline number of 88.44% blind-test AUC. That is respectable benchmark engineering, but there is no silent-speech sensing, articulation modeling, or communication interface angle here.",
      "expert_true_value": "The full text supports a decent challenge entry, but it is plainly outside SSI: model ensemble reaches 86.72 AUC on Track-1 test and the paper headline reports 88.44% AUC on the blind fusion track.",
      "canon_before": "Low-resource acoustic diagnosis tasks often relied on hand-crafted features or single-model training recipes.",
      "delta_from_canon": "The paper frames the main gain as better pretraining and ensemble strategy, not a new SSI or speech-interface method.",
      "position_in_field": "Respiratory-audio classification paper that should remain marked as out of scope for SSI.",
      "practical_value": "Relevant for benchmark acoustic diagnosis, not for silent-speech interfaces.",
      "axes_moved": "audio classification; transfer learning",
      "axes_unresolved": "clinical generalization; SSI relevance",
      "axes_regressed": "",
      "technical_limits": "It is a medical audio classifier and offers no SSI relevance beyond generic acoustic modeling.",
      "evaluation_limits": "The evidence is challenge-specific and constrained by the DiCOVA data regime.",
      "deployment_limits": "No SSI deployment path exists in the paper.",
      "scope_limits": "COVID-19 detection from respiratory audio only.",
      "task": "audio-classification",
      "input_modality": "acoustic breathing, cough, and speech signals",
      "sensor_hardware": "",
      "body_site": "",
      "output_type": "labels",
      "vocabulary_type": "binary diagnosis",
      "vocabulary_size": "2 classes",
      "metrics": "ROC-AUC on validation and blind test sets",
      "evaluation_mode": "5-fold cross-validation and blind-test benchmark reporting on the DiCOVA-ICASSP 2022 challenge",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 0.99,
          "statement": "The abstract presents a BiLSTM COVID-19 detector that uses breath, cough, and speech signals with supervised and self-supervised pretraining.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_supervised-and-self-supervised-pretraining-based-covid-19-detection-using-acoustic-breathing-cough-speech-signals-supervised-and-self-supervised-pretraining-based-covid-19-detection-using-acoust.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_supervised-and-self-supervised-pretraining-based-covid-19-detection-using-acoustic-breathing-cough-speech-signals-supervised-and-self-supervised-pretraining-based-covid-19-detection-using-acoust.txt",
          "section_or_location": "ABSTRACT",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 0.98,
          "statement": "Section 3.1 ties the experiments to the DiCOVA-ICASSP 2022 challenge dataset derived from Coswara.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_supervised-and-self-supervised-pretraining-based-covid-19-detection-using-acoustic-breathing-cough-speech-signals-supervised-and-self-supervised-pretraining-based-covid-19-detection-using-acoust.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_supervised-and-self-supervised-pretraining-based-covid-19-detection-using-acoustic-breathing-cough-speech-signals-supervised-and-self-supervised-pretraining-based-covid-19-detection-using-acoust.txt",
          "section_or_location": "3.1. Datasets",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.99,
          "statement": "Table 1 shows the Track-1 model ensemble at 86.72 test AUC and 80.05 validation AUC, and the abstract highlights 88.44% AUC on the blind fusion-track test.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_supervised-and-self-supervised-pretraining-based-covid-19-detection-using-acoustic-breathing-cough-speech-signals-supervised-and-self-supervised-pretraining-based-covid-19-detection-using-acoust.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_supervised-and-self-supervised-pretraining-based-covid-19-detection-using-acoustic-breathing-cough-speech-signals-supervised-and-self-supervised-pretraining-based-covid-19-detection-using-acoust.txt",
          "section_or_location": "Table 1: The AUC score of different methods on test/validation sets.",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_visualtts-tts-with-accurate-lip-speech-synchronization-for-automatic-voice-over",
      "slug": "visualtts-tts-with-accurate-lip-speech-synchronization-for-automatic-voice-over",
      "title": "VisualTTS: TTS with Accurate Lip-Speech Synchronization for Automatic Voice Over",
      "year": 2022,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Junchen Lu",
        "Berrak Sisman",
        "Rui Liu",
        "Mingyang Zhang",
        "Haizhou Li"
      ],
      "url": "https://nao-ki-mura.com/paper/visualtts-tts-with-accurate-lip-speech-synchronization-for-automatic-voice-over",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2110.03342",
      "arxiv_url": "https://arxiv.org/abs/2110.03342",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium-high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:lip",
        "deployment:hands-free",
        "deployment:speaker-independent",
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "modality:video",
        "output:speech-audio",
        "task:speech-reconstruction"
      ],
      "expert_take_short": "VisualTTS effectively improves lip-speech synchronization in scripted voice over by conditioning TTS on lip video, but does not tackle silent speech decoding or unscripted scenarios.",
      "expert_take_long": "VisualTTS presents an audio-visual multi-speaker neural TTS system conditioned on both text and lip video inputs to synthesize speech that is accurately synchronized with lip motions. Key innovations include textual-visual attention that aligns textual embeddings with visual lip embeddings extracted by a pretrained lip-reading model, and a visual fusion strategy that incorporates temporal visual features into the acoustic decoder. Experiments on the constrained GRID dataset show significant improvements in lip-speech synchronization metrics (LSE-C, LSE-D, frame disturbance) compared to Tacotron baselines, including one with textual-visual attention but without fusion. Subjective listening tests show no improvement in speech naturalness, indicating VisualTTS primarily enhances audiovisual synchrony. Limitations include reliance on scripted video-text input pairs, fixed vocabulary and speaker identities, and no demonstration of real-time synthesis or generalization. The work constitutes a meaningful contribution for dubbing and automated voice-over applications but does not address silent speech recognition or free-speaking SSI tasks, positioning it adjacent to but not core within silent speech interfaces.",
      "expert_true_value": "Demonstrates that integrating lip-video visual embeddings via novel textual-visual attention and visual fusion within TTS measurably improves audiovisual synchronization, advancing automatic voice over technology.",
      "canon_before": "Traditional TTS systems synthesize speech from text input without considering lip video or lip-speech temporal synchronization, often generating natural but temporally unsynchronized speech.",
      "delta_from_canon": "Unlike canonical single-modal TTS, VisualTTS integrates lip motion visual embeddings at alignment and acoustic decoding stages to optimize speech output timing to synchronize with video lip motion.",
      "position_in_field": "A specialized audiovisual synchronization-enhanced TTS related but not core to silent speech interfaces; does not decode speech from silent video without text input.",
      "practical_value": "Valuable for automatic voice over and dubbing applications with available scripted silent videos; improves perceived audiovisual synchrony without compromising naturalness.",
      "axes_moved": "system_design; evaluation",
      "axes_unresolved": "The system requires both text script and lip video as input; it does not address decoding from video-only silent speech or speaker adaptation to unseen speakers.",
      "axes_regressed": "",
      "technical_limits": "Requires paired text scripts aligned with video; training and evaluation restricted to fixed grammar GRID corpus; visual encoder weights fixed during TTS training; no speaker-independent or vocabulary adaptation shown.",
      "evaluation_limits": "Evaluation restricted to the scripted GRID dataset; no tests on spontaneous speech, large-vocabulary, unseen speakers, or real-time synthesis; subjective voice quality comparable to baselines with no improvement.",
      "deployment_limits": "Requires pre-recorded silent lip video and matching text script; limited to scripted GRID dataset utterances; unknown real-time capability; fixed speaker set and vocabulary; no adaptation to unseen speakers or spontaneous speech.",
      "scope_limits": "Limited to GRID dataset scripted utterances with paired silent video and text; no demonstration on spontaneous or unconstrained speech or unseen speakers.",
      "task": "automatic voice over with lip-speech synchronization",
      "input_modality": "Text script plus silent lip video from a mono camera capturing lip region.",
      "sensor_hardware": "Mono video camera capturing lip region from video frames.",
      "body_site": "lip",
      "output_type": "speech-audio",
      "vocabulary_type": "Fixed-grammar scripted sentences from the predefined GRID dataset.",
      "vocabulary_size": "33,000 utterances from GRID corpus spanning 33 speakers with fixed sentence grammar.",
      "metrics": "LSE-C (higher better): 5.87; LSE-D (lower better): 8.45; Frame Disturbance (lower better): 5.92; MOS: 4.17±0.06 on GRID test set averaged across 33 speakers, comparing favorably in lip-speech synchronization to baselines.",
      "evaluation_mode": "Objective lip-speech synchronization metrics (LSE-C, LSE-D, Frame Disturbance) and subjective listening tests including MOS and preference tests on synthetic speech paired with test videos.",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 1,
          "statement": "The paper formulates the automatic voice over (AVO) task and proposes VisualTTS, a visual-conditioned TTS model with textual-visual attention and visual fusion strategies for accurate lip-speech synchronization.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_visualtts-tts-with-accurate-lip-speech-synchronization-for-automatic-voice-over-visualtts-tts-with-accurate-lip-speech-synchronization-for-automatic-voice-over.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_visualtts-tts-with-accurate-lip-speech-synchronization-for-automatic-voice-over-visualtts-tts-with-accurate-lip-speech-synchronization-for-automatic-voice-over.txt",
          "section_or_location": "ABSTRACT",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 1,
          "statement": "VisualTTS adopts two novel mechanisms: 1) textual-visual attention (TVA), which aligns textual and visual embeddings, and 2) a visual fusion strategy during acoustic decoding that incorporates visual embeddings into mel-spectrogram generation, improving lip-speech synchronization.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_visualtts-tts-with-accurate-lip-speech-synchronization-for-automatic-voice-over-visualtts-tts-with-accurate-lip-speech-synchronization-for-automatic-voice-over.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_visualtts-tts-with-accurate-lip-speech-synchronization-for-automatic-voice-over-visualtts-tts-with-accurate-lip-speech-synchronization-for-automatic-voice-over.txt",
          "section_or_location": "3. VISUALTTS",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 1,
          "statement": "Experiments conducted on the GRID dataset with 33 speakers, 900 training and 100 test utterances each, use 24kHz audio and 25Hz video with lip region cropping.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_visualtts-tts-with-accurate-lip-speech-synchronization-for-automatic-voice-over-visualtts-tts-with-accurate-lip-speech-synchronization-for-automatic-voice-over.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_visualtts-tts-with-accurate-lip-speech-synchronization-for-automatic-voice-over-visualtts-tts-with-accurate-lip-speech-synchronization-for-automatic-voice-over.txt",
          "section_or_location": "4. EXPERIMENTS",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "Lip-speech synchronization is evaluated with LSE-C (higher better), LSE-D (lower better), and Frame Disturbance (FD, lower better). VisualTTS achieves LSE-C 5.87, LSE-D 8.45, FD 5.92, outperforming baselines Tacotron and Tacotron with TVA.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_visualtts-tts-with-accurate-lip-speech-synchronization-for-automatic-voice-over-visualtts-tts-with-accurate-lip-speech-synchronization-for-automatic-voice-over.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_visualtts-tts-with-accurate-lip-speech-synchronization-for-automatic-voice-over-visualtts-tts-with-accurate-lip-speech-synchronization-for-automatic-voice-over.txt",
          "section_or_location": "4. EXPERIMENTS",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "Subjective listening tests with 12 listeners show VisualTTS achieves a mean opinion score (MOS) of 4.17±0.06, comparable to the baselines, indicating similar speech naturalness.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_visualtts-tts-with-accurate-lip-speech-synchronization-for-automatic-voice-over-visualtts-tts-with-accurate-lip-speech-synchronization-for-automatic-voice-over.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_visualtts-tts-with-accurate-lip-speech-synchronization-for-automatic-voice-over-visualtts-tts-with-accurate-lip-speech-synchronization-for-automatic-voice-over.txt",
          "section_or_location": "4. EXPERIMENTS",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 1,
          "statement": "Limitations include the requirement of a paired text script and silent video, evaluation only on the scripted GRID dataset with fixed grammar and vocabulary, and no demonstration of real-time synthesis or of generalization to unseen speakers or spontaneous speech.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_visualtts-tts-with-accurate-lip-speech-synchronization-for-automatic-voice-over-visualtts-tts-with-accurate-lip-speech-synchronization-for-automatic-voice-over.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_visualtts-tts-with-accurate-lip-speech-synchronization-for-automatic-voice-over-visualtts-tts-with-accurate-lip-speech-synchronization-for-automatic-voice-over.txt",
          "section_or_location": "5. CONCLUSION",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "deployment_claim",
          "confidence": 1,
          "statement": "VisualTTS requires a pre-recorded silent video and matching text script for synthesis, limiting deployment to scripted applications like automatic dubbing; real-time capability and mobile suitability are not reported.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_visualtts-tts-with-accurate-lip-speech-synchronization-for-automatic-voice-over-visualtts-tts-with-accurate-lip-speech-synchronization-for-automatic-voice-over.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_visualtts-tts-with-accurate-lip-speech-synchronization-for-automatic-voice-over-visualtts-tts-with-accurate-lip-speech-synchronization-for-automatic-voice-over.txt",
          "section_or_location": "3. VISUALTTS",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_sequence-to-sequence-voice-reconstruction-for-silent-speech-in-a-tonal-language",
      "slug": "sequence-to-sequence-voice-reconstruction-for-silent-speech-in-a-tonal-language",
      "title": "Sequence-to-Sequence Voice Reconstruction for Silent Speech in a Tonal Language",
      "year": 2022,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Huiyan Li",
        "Haohong Lin",
        "You Wang",
        "Hengyang Wang",
        "Ming Zhang",
        "Han Gao",
        "Qing Ai",
        "Zhiyuan Luo",
        "Guang Li"
      ],
      "url": "https://nao-ki-mura.com/paper/sequence-to-sequence-voice-reconstruction-for-silent-speech-in-a-tonal-language",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2108.00190",
      "arxiv_url": "https://arxiv.org/abs/2108.00190",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:face",
        "deployment:hands-free",
        "deployment:speaker-dependent",
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "modality:emg",
        "output:speech-audio",
        "task:speech-reconstruction"
      ],
      "expert_take_short": "SSRNet innovatively applies duration-aware Seq2Seq modeling and tonal multitask learning to reconstruct intelligible Mandarin speech from facial sEMG signals, markedly improving performance over prior methods, but it remains speaker-dependent with limited deployment evaluation.",
      "expert_take_long": "This work presents SSRNet, a novel duration-regulated Seq2Seq model for reconstructing audible speech from silent sEMG signals in Mandarin Chinese, a tonal language where tone preservation is essential. By extracting duration alignment via DTW and employing a learned duration predictor and length regulator, SSRNet aligns variable-length silent sEMG features with audio frame counts, enabling more accurate mel-spectrogram generation. The model also incorporates a multitask loss combining vocal sEMG reconstruction and toneme classification to enhance tonal feature fidelity. Trained and evaluated on a Mandarin silent speech corpus from six speakers, SSRNet demonstrates significantly reduced character error rates both objectively via automatic speech recognition and subjectively through human listeners, surpassing a baseline method by a wide margin. However, the approach remains speaker-dependent, with no cross-speaker or environment generalization tested, no assessment of latency or real-time capability, and limited to controlled read speech context. Overall, SSRNet advances tonal silent speech decoding by explicitly modeling timing and tonal information, providing a valuable architecture for future development though deployment challenges persist.",
      "expert_true_value": "Demonstrates effective duration-regulated Seq2Seq mapping with toneme supervision enabling practical Mandarin tonal silent speech reconstruction from sEMG, bridging the gap between neuromuscular signal decoding and natural sounding tonal speech synthesis in tonal languages.",
      "canon_before": "Prior sEMG silent speech reconstruction approaches mostly treated decoding as frame-level tasks in non-tonal languages, without explicit duration alignment or handling tonal features critical in Mandarin Chinese silent speech synthesis.",
      "delta_from_canon": "Introduces explicit duration extraction between silent sEMG and audio via DTW and integrates a duration predictor and length regulator to time-align input features, plus toneme classification loss for tonal information preservation, enabling effective tonal speech reconstruction in Mandarin.",
      "position_in_field": "State-of-the-art demonstration of tonal silent speech reconstruction using facial sEMG with duration-regulated Seq2Seq and tonal multitask learning, significantly improving over previous baselines but limited to controlled conditions and speakers.",
      "practical_value": "Provides foundational design and evaluation framework for tonal silent speech reconstruction valuable for privacy-preserving speech augmentation in noisy environments; key step towards practical Mandarin SSI with room to enhance real-time capability and robustness.",
      "axes_moved": "system_design;evaluation",
      "axes_unresolved": "Cross-speaker transfer, real-time latency, walking/mobile robustness",
      "axes_regressed": "",
      "technical_limits": "Speaker-dependent training, limited to controlled read sentences, no cross-speaker generalization, no evaluation of real-time latency or robustness to noise or motion, few electrodes with limited articulatory coverage, limited vocabulary",
      "evaluation_limits": "Evaluation limited to six native Mandarin speakers, fixed controlled read sentences; phoneme and tone classification limited to in-vocabulary tokens; no unseen word generalization or cross-domain tests; no mobile or real-time latency tests; subjective tests with 10 listeners.",
      "deployment_limits": "Speaker-dependent models trained on six specific Mandarin speakers; requires five facial sEMG electrodes with fixed placement; no reported real-time inference or latency analysis; lacks cross-speaker or cross-environment robustness evaluation; no assessment in walking or mobile conditions.",
      "scope_limits": "Limited to silent speech reconstruction from facial sEMG in Mandarin Chinese with fixed electrode setup; no evaluation in other languages, body sites, or conversational scenarios.",
      "task": "sEMG-based silent speech-to-voice reconstruction",
      "input_modality": "Five-channel facial surface electromyography (sEMG) recorded at 2000 Hz with Ag/AgCl electrodes placed near mouth and neck muscles",
      "sensor_hardware": "Five facial surface Ag/AgCl electrodes positioned near nose, mouth corners, and chin with a sampling frequency of 2000 Hz; differential electrode for channel 1 and single electrodes for others",
      "body_site": "face",
      "output_type": "speech-audio",
      "vocabulary_type": "Mandarin read sentences",
      "vocabulary_size": "2260 words / 1373 characters from AISHELL-3 corpus read sentences",
      "metrics": "Average objective ASR CER 21.99%±4.99% on six speakers; subjective CER 6.41% average (best 1.19%); accompanied by Mel-Cepstral Distortion (MCD) and Short-Term Objective Intelligibility (STOI) metrics; baseline CER was 46.62% objective and 39.76% subjective.",
      "evaluation_mode": "Objective: ASR character error rate (CER), Mel-Cepstral Distortion (MCD), Short-Term Objective Intelligibility (STOI). Subjective: Human listener transcription CER and naturalness ratings per speaker.",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 0.98,
          "statement": "We propose SSRNet, the first Seq2Seq-based method for silent speech reconstruction in Mandarin Chinese, extracting duration alignment between silent sEMG and vocal audio, with joint toneme classification and vocal sEMG reconstruction losses, and use a state-of-the-art vocoder to generate audio waveform.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_sequence-to-sequence-voice-reconstruction-for-silent-speech-in-a-tonal-language-sequence-to-sequence-voice-reconstruction-for-silent-speech-in-a-tonal-language.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_sequence-to-sequence-voice-reconstruction-for-silent-speech-in-a-tonal-language-sequence-to-sequence-voice-reconstruction-for-silent-speech-in-a-tonal-language.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "fact",
          "confidence": 1,
          "statement": "Five channels of facial sEMG are recorded using wet Ag/AgCl electrodes positioned around the face and neck, sampled at 2000 Hz. Electrodes are placed 1cm from nose, mouth corners, and chin, with one channel differential.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_sequence-to-sequence-voice-reconstruction-for-silent-speech-in-a-tonal-language-sequence-to-sequence-voice-reconstruction-for-silent-speech-in-a-tonal-language.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_sequence-to-sequence-voice-reconstruction-for-silent-speech-in-a-tonal-language-sequence-to-sequence-voice-reconstruction-for-silent-speech-in-a-tonal-language.txt",
          "section_or_location": "II. DATA ACQUISITION",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "fact",
          "confidence": 1,
          "statement": "Dataset includes six native Mandarin speakers reading phonetically balanced AISHELL-3 sentences, totaling about 5.79 hours of silent speech, split 8:1:1 for training, validation, testing per speaker.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_sequence-to-sequence-voice-reconstruction-for-silent-speech-in-a-tonal-language-sequence-to-sequence-voice-reconstruction-for-silent-speech-in-a-tonal-language.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_sequence-to-sequence-voice-reconstruction-for-silent-speech-in-a-tonal-language-sequence-to-sequence-voice-reconstruction-for-silent-speech-in-a-tonal-language.txt",
          "section_or_location": "II. DATA ACQUISITION",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 0.95,
          "statement": "SSRNet introduces a duration extractor using DTW to obtain alignment between silent sEMG and audio, duration predictor to predict length alignment, and length regulator to resample features, enabling Seq2Seq mel-spectrogram prediction with tonal-aware multitask learning.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_sequence-to-sequence-voice-reconstruction-for-silent-speech-in-a-tonal-language-sequence-to-sequence-voice-reconstruction-for-silent-speech-in-a-tonal-language.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_sequence-to-sequence-voice-reconstruction-for-silent-speech-in-a-tonal-language-sequence-to-sequence-voice-reconstruction-for-silent-speech-in-a-tonal-language.txt",
          "section_or_location": "III. THE PROPOSED METHODS",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.99,
          "statement": "On six speakers, SSRNet achieves average ASR character error rate (CER) of 21.99% (SD 4.99%), outperforming baseline with CER 46.62% objectively. Subjective human transcription CER is 6.41% average (best 1.19%) across six speakers, with naturalness ratings improved.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_sequence-to-sequence-voice-reconstruction-for-silent-speech-in-a-tonal-language-sequence-to-sequence-voice-reconstruction-for-silent-speech-in-a-tonal-language.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_sequence-to-sequence-voice-reconstruction-for-silent-speech-in-a-tonal-language-sequence-to-sequence-voice-reconstruction-for-silent-speech-in-a-tonal-language.txt",
          "section_or_location": "IV. EXPERIMENTS",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 0.95,
          "statement": "The system uses speaker-dependent training, limited to six Mandarin speakers with no cross-speaker or cross-environment generalization tested, no latency or real-time inference evaluation, and no mobile or walking scenario testing.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_sequence-to-sequence-voice-reconstruction-for-silent-speech-in-a-tonal-language-sequence-to-sequence-voice-reconstruction-for-silent-speech-in-a-tonal-language.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_sequence-to-sequence-voice-reconstruction-for-silent-speech-in-a-tonal-language-sequence-to-sequence-voice-reconstruction-for-silent-speech-in-a-tonal-language.txt",
          "section_or_location": "IV. EXPERIMENTS",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "deployment_claim",
          "confidence": 0.9,
          "statement": "The approach requires wearing five specific facial sEMG electrodes; no real-time operation, cross-speaker transfer, or mobile scenario tested, limiting deployment readiness currently to controlled lab environments.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_sequence-to-sequence-voice-reconstruction-for-silent-speech-in-a-tonal-language-sequence-to-sequence-voice-reconstruction-for-silent-speech-in-a-tonal-language.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_sequence-to-sequence-voice-reconstruction-for-silent-speech-in-a-tonal-language-sequence-to-sequence-voice-reconstruction-for-silent-speech-in-a-tonal-language.txt",
          "section_or_location": "II. DATA ACQUISITION",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "fact",
          "confidence": 1,
          "statement": "Output audio waveforms are generated by a pre-trained Parallel WaveGAN vocoder conditioned on predicted mel-spectrograms.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_sequence-to-sequence-voice-reconstruction-for-silent-speech-in-a-tonal-language-sequence-to-sequence-voice-reconstruction-for-silent-speech-in-a-tonal-language.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_sequence-to-sequence-voice-reconstruction-for-silent-speech-in-a-tonal-language-sequence-to-sequence-voice-reconstruction-for-silent-speech-in-a-tonal-language.txt",
          "section_or_location": "III. THE PROPOSED METHODS",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "fact",
          "confidence": 1,
          "statement": "Training loss includes mean absolute error between predicted and target mel-spectrograms, mean squared error for predicted duration against ground truth from DTW alignment, cross-entropy toneme classification loss, and a vocal sEMG reconstruction loss, jointly optimized.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_sequence-to-sequence-voice-reconstruction-for-silent-speech-in-a-tonal-language-sequence-to-sequence-voice-reconstruction-for-silent-speech-in-a-tonal-language.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_sequence-to-sequence-voice-reconstruction-for-silent-speech-in-a-tonal-language-sequence-to-sequence-voice-reconstruction-for-silent-speech-in-a-tonal-language.txt",
          "section_or_location": "III. THE PROPOSED METHODS",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "fact",
          "confidence": 0.98,
          "statement": "Evaluation metrics include objective ASR CER via Mandarin ASR system, Mel-Cepstral Distortion (MCD), Short-Term Objective Intelligibility (STOI), as well as subjective human listener transcriptions and naturalness ratings over reconstructed voices from six speakers.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_sequence-to-sequence-voice-reconstruction-for-silent-speech-in-a-tonal-language-sequence-to-sequence-voice-reconstruction-for-silent-speech-in-a-tonal-language.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_sequence-to-sequence-voice-reconstruction-for-silent-speech-in-a-tonal-language-sequence-to-sequence-voice-reconstruction-for-silent-speech-in-a-tonal-language.txt",
          "section_or_location": "IV. EXPERIMENTS",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "kimura2022_silentspeller",
      "slug": "silentspeller",
      "title": "SilentSpeller: Towards mobile, hands-free, silent speech text entry using electropalatography",
      "year": 2022,
      "venue": "CHI '22",
      "authors": [
        "Naoki Kimura",
        "Tan Gemicioglu",
        "Jonathan Womack",
        "Yuhui Zhao",
        "Richard Li",
        "Abdelkareem Bedri",
        "Zixiong Su",
        "Alex Olwal",
        "Jun Rekimoto",
        "Thad Starner"
      ],
      "url": "https://nao-ki-mura.com/paper/silentspeller",
      "doi": "10.1145/3491102.3502015",
      "doi_url": "https://doi.org/10.1145/3491102.3502015",
      "arxiv_id": "",
      "arxiv_url": "",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text+existing_expert_seed",
      "source_coverage": "high",
      "tags": [
        "body_site:palate",
        "body_site:tongue",
        "deployment:hands-free",
        "deployment:mobile-suitable",
        "deployment:real-time",
        "deployment:speaker-dependent",
        "deployment:wearable",
        "evaluation:quantitative",
        "evaluation:unseen-words",
        "evaluation:walking-tested",
        "modality:electropalatography",
        "output:text",
        "task:text-entry"
      ],
      "expert_take_short": "SilentSpeller is a strong, rigorously tested SSI system that reframes silent speech as silent spelling, enabling large vocabulary, live text entry, and walking robustness with in-mouth electropalatography sensors.",
      "expert_take_long": "SilentSpeller offers a carefully validated alternative to classic silent speech interfaces by changing the recognition task from continuous silent speech to discrete silent spelling. This reframing produces a more structured signal that, together with electropalatography sensors and a user-dependent HMM recognizer, supports a large vocabulary of 1164 words with high offline accuracy. The system uniquely supports live interactive text entry at speeds around 37 wpm with 87% accuracy, including robust performance while walking, demonstrating tolerance to motion artifacts. Limitations include the requirement for custom dental impressions, obtrusive mouth hardware, strong user dependence for training, and struggles with user independence and social acceptability. Overall, the work advances SSI toward viable mobile, hands-free text entry applications in privacy-sensitive or hands-busy scenarios, providing extensive empirical evidence backing claims.",
      "expert_true_value": "The paper's main contribution is the novel problem reframing from silent speech to silent spelling using an electropalatography retainer, which yields practical live text entry over large vocabularies including unseen words, with empirical validation of robustness to walking and comparison to mainstream mobile text input.",
      "canon_before": "Most silent speech interfaces were limited to small vocabularies (~100 words), stationary use, and offline, non-interactive experiments, with little evidence for practical live text entry.",
      "delta_from_canon": "The key change is reframing the task from silent speech recognition to silent spelling recognition, allowing a larger vocabulary (1164 offline words), robust unseen-word generalization, tolerance to walking motion, and live hands-free text entry at reasonable speeds (~37 wpm average).",
      "position_in_field": "One of the clearest practical SSI task-reframing papers to date; less natural than silent speech but more usable for mobile text entry.",
      "practical_value": "High for privacy-sensitive communication and hands-busy users; useful where speech is socially inappropriate and users can manage oral hardware.",
      "axes_moved": "large_vocabulary; unseen_word_generalization; walking_robustness; live_text_entry; hands_free_texting",
      "axes_unresolved": "user_independence; comfort; social_acceptability; broader_symbol_input",
      "axes_regressed": "",
      "technical_limits": "Recognition confusions occur mainly for letters with similar palatograms, especially EE-sound letters (B/P, D/T/Z). Strong user-dependence; user-independent recognition remains poor.",
      "evaluation_limits": "Offline experiments rely on data from only two main participants for tuning; live text entry and walking tests include seven users but under constrained phrase tasks; vocabulary is limited to English letters and space without punctuation or capitalization.",
      "deployment_limits": "The system requires a custom-fitted in-mouth SmartPalate retainer with 124 electrodes connected through a wired interface (a partially wireless prototype now exists). The retainer remains obtrusive, and the required user-dependent training limits scalability and ease of deployment.",
      "scope_limits": "The system targets discreet text entry, explicitly trading away naturalness of silent speech for reliability; not a conversational silent speech system.",
      "task": "text-entry using silent spelling",
      "input_modality": "electropalatography",
      "sensor_hardware": "SmartPalate custom dental retainer with 124 capacitive electrodes sampled at 100 Hz, connected wired or wireless to processing device.",
      "body_site": "palate; tongue",
      "output_type": "text",
      "vocabulary_type": "Dictionary-based silent spelling with triletter HMM decoding, phrase composition, and bigram correction",
      "vocabulary_size": "1164 isolated-word dictionary offline; live text entry with 321 unique words in phrase corpus",
      "metrics": "Offline HMM accuracy: ~97% character, 92% word; Unseen word offline test: 94.5% character, 85.5% word accuracy; Walking and seated phrase recognition: ~97.5% vs 96.5% character accuracy; Live text entry average 37 words per minute at 87% accuracy, best participant 53 wpm at 91%.",
      "evaluation_mode": "Offline isolated word recognition with 10-fold cross validation; reserve testing on 100 unseen words; seated vs walking phrase recognition; live interactive text entry with push-to-talk interface and edit gestures.",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 1,
          "statement": "We introduce silent spelling as an alternative for silent speech interaction (SSI). In silent spelling, the user spells words without voicing, which increases signal reliability and enables larger vocabularies (1164 words in this work) and on-the-go interaction.",
          "evidence_source": "inputs/ssi_fulltext/text/kimura2022_silentspeller-silentspeller-towards-mobile-hands-free-silent-speech-text-entry-using-electropa.txt",
          "source_ref": "inputs/ssi_fulltext/text/kimura2022_silentspeller-silentspeller-towards-mobile-hands-free-silent-speech-text-entry-using-electropa.txt",
          "section_or_location": "1 INTRODUCTION",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "Offline HMM recognition on 2328 isolated words (1164 unique) achieves an average of 97% character accuracy and 92% word accuracy; unseen word testing on 100 words removed from training achieves 94.5% character and 85.5% word accuracy; phrase recognition yields 97.5% character accuracy while walking vs. 96.5% seated; live text entry across seven users averaged 37 words per minute at 87% accuracy, with a top participant reaching 53 wpm at 91% accuracy.",
          "evidence_source": "inputs/ssi_fulltext/text/kimura2022_silentspeller-silentspeller-towards-mobile-hands-free-silent-speech-text-entry-using-electropa.txt",
          "source_ref": "inputs/ssi_fulltext/text/kimura2022_silentspeller-silentspeller-towards-mobile-hands-free-silent-speech-text-entry-using-electropa.txt",
          "section_or_location": "TUNING MODELS",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "fact",
          "confidence": 1,
          "statement": "SilentSpeller employs a custom in-mouth SmartPalate retainer with 124 binary capacitive electrodes at 100 Hz, which senses tongue-palate contact patterns; a custom dental impression is required.",
          "evidence_source": "inputs/ssi_fulltext/text/kimura2022_silentspeller-silentspeller-towards-mobile-hands-free-silent-speech-text-entry-using-electropa.txt",
          "source_ref": "inputs/ssi_fulltext/text/kimura2022_silentspeller-silentspeller-towards-mobile-hands-free-silent-speech-text-entry-using-electropa.txt",
          "section_or_location": "3.2 SMARTPALATE",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 1,
          "statement": "Evaluation includes offline isolated word recognition with ten-fold cross validation on two participants, unseen word tests with 100 randomly removed words, walking vs seated phrase recognition on two participants, and live interactive text entry experiments on seven participants using 321-word phrase vocabulary.",
          "evidence_source": "inputs/ssi_fulltext/text/kimura2022_silentspeller-silentspeller-towards-mobile-hands-free-silent-speech-text-entry-using-electropa.txt",
          "source_ref": "inputs/ssi_fulltext/text/kimura2022_silentspeller-silentspeller-towards-mobile-hands-free-silent-speech-text-entry-using-electropa.txt",
          "section_or_location": "TUNING MODELS",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "deployment_claim",
          "confidence": 1,
          "statement": "SilentSpeller is wearable, hands-free, mobile suitable and real-time capable; however, current hardware requires a wired connection to external hardware, making it obtrusive. Wireless prototypes using Bluetooth Low Energy dongles have been developed, but the device remains obtrusive and requires custom dental impressions, limiting deployment readiness.",
          "evidence_source": "inputs/ssi_fulltext/text/kimura2022_silentspeller-silentspeller-towards-mobile-hands-free-silent-speech-text-entry-using-electropa.txt",
          "source_ref": "inputs/ssi_fulltext/text/kimura2022_silentspeller-silentspeller-towards-mobile-hands-free-silent-speech-text-entry-using-electropa.txt",
          "section_or_location": "3.2 SMARTPALATE",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "deployment_claim",
          "confidence": 1,
          "statement": "The system's user-dependent training regime requires about 1-2 hours of data collection (2328 isolated words) per participant to reach optimal accuracy; user independent models tested so far yield poor accuracy (~55% character), limiting deployment readiness.",
          "evidence_source": "inputs/ssi_fulltext/text/kimura2022_silentspeller-silentspeller-towards-mobile-hands-free-silent-speech-text-entry-using-electropa.txt",
          "source_ref": "inputs/ssi_fulltext/text/kimura2022_silentspeller-silentspeller-towards-mobile-hands-free-silent-speech-text-entry-using-electropa.txt",
          "section_or_location": "4.3 TUNING USER DEPENDENT RECOGNIZERS",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "deployment_claim",
          "confidence": 1,
          "statement": "Although SilentSpeller shows little performance degradation during walking, the current in-mouth retainer and wired setup remain intrusive and socially conspicuous; social acceptability therefore remains unresolved and limits deployment.",
          "evidence_source": "inputs/ssi_fulltext/text/kimura2022_silentspeller-silentspeller-towards-mobile-hands-free-silent-speech-text-entry-using-electropa.txt",
          "source_ref": "inputs/ssi_fulltext/text/kimura2022_silentspeller-silentspeller-towards-mobile-hands-free-silent-speech-text-entry-using-electropa.txt",
          "section_or_location": "8 DISCUSSION",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "fact",
          "confidence": 1,
          "statement": "SilentSpeller recognizes sequences of 26 English letters spelled silently using tongue-palate contact patterns, decoding via HMMs with triletter states, using PCA features reduced to 16 top eigen-palate components, sampled at 100 Hz.",
          "evidence_source": "inputs/ssi_fulltext/text/kimura2022_silentspeller-silentspeller-towards-mobile-hands-free-silent-speech-text-entry-using-electropa.txt",
          "source_ref": "inputs/ssi_fulltext/text/kimura2022_silentspeller-silentspeller-towards-mobile-hands-free-silent-speech-text-entry-using-electropa.txt",
          "section_or_location": "3.3 RECOGNIZER PIPELINE",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_sa-sdr-a-novel-loss-function-for-separation-of-meeting-style-data",
      "slug": "sa-sdr-a-novel-loss-function-for-separation-of-meeting-style-data",
      "title": "SA-SDR: A novel loss function for separation of meeting style data",
      "year": 2021,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Thilo von Neumann",
        "Keisuke Kinoshita",
        "Christoph Boeddeker",
        "Marc Delcroix",
        "Reinhold Haeb-Umbach"
      ],
      "url": "https://nao-ki-mura.com/paper/sa-sdr-a-novel-loss-function-for-separation-of-meeting-style-data",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2110.15581",
      "arxiv_url": "https://arxiv.org/abs/2110.15581",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "modality:acoustic",
        "output:speech-audio",
        "task:speech-enhancement",
        "evaluation:quantitative",
        "evaluation:structured-benchmark"
      ],
      "expert_take_short": "Elegant loss fix, not SSI.",
      "expert_take_long": "The paper matters because it removes a genuine pathology in SDR-based training instead of papering over it with ad hoc constants. SA-SDR is competitive on WSJ0-2mix and clearly useful on meeting-style data, where the main difficulty is silence and partial overlap. That makes it a solid speech-separation loss paper, but still outside silent speech interfaces except as very indirect context.",
      "expert_true_value": "The real contribution is objective design, not a new separator: SA-SDR gives a cleaner way to train on realistic meeting data with silent targets.",
      "canon_before": "Speech separation losses usually average per-output SDR terms and become unstable on silence-heavy meeting data.",
      "delta_from_canon": "Aggregates all outputs into one global SDR objective instead of averaging channel-wise SDR losses.",
      "position_in_field": "Strong speech-separation loss paper, not an SSI paper.",
      "practical_value": "Useful if you need a more stable source-separation objective on overlap patterns that include silence.",
      "axes_moved": "loss design; evaluation",
      "axes_unresolved": "generalization across separators; downstream ASR impact beyond the tested setup",
      "axes_regressed": "",
      "technical_limits": "This is still a speech-separation study with standard acoustic mixtures and no silent-speech modality.",
      "evaluation_limits": "The evidence is benchmark-only and does not test downstream SSI or human-facing systems.",
      "deployment_limits": "No silent-interface deployment path is discussed.",
      "scope_limits": "Speech separation on meeting-style audio mixtures only.",
      "task": "speech separation loss for meeting-style data",
      "input_modality": "speech audio",
      "sensor_hardware": "",
      "body_site": "",
      "output_type": "separated speech audio",
      "vocabulary_type": "",
      "vocabulary_size": "",
      "metrics": "WSJ0-2mix BSSEval SDR 18.0 with SA-SDR versus 17.8 with A-SDR; on meeting-style data SA-SDR reaches 19.8 BSSEval SDR and 16.1 SA-SDR while SA-tSDR reaches 17.9 SA-SDR",
      "evaluation_mode": "WSJ0-2mix and meeting-style separation comparison using BSSEval SDR, SA-SDR, WER, attenuation ratio, and VAER",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 0.99,
          "statement": "The paper proposes source-aggregated SDR as a global objective that stays robust when one reference channel is silent or perfectly reconstructed.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_sa-sdr-a-novel-loss-function-for-separation-of-meeting-style-data-sa-sdr-a-novel-loss-function-for-separation-of-meeting-style-data.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_sa-sdr-a-novel-loss-function-for-separation-of-meeting-style-data-sa-sdr-a-novel-loss-function-for-separation-of-meeting-style-data.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 0.97,
          "statement": "The method changes how SDR is aggregated across separator outputs, switching from channel-wise averaging to one global SDR over all outputs.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_sa-sdr-a-novel-loss-function-for-separation-of-meeting-style-data-sa-sdr-a-novel-loss-function-for-separation-of-meeting-style-data.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_sa-sdr-a-novel-loss-function-for-separation-of-meeting-style-data-sa-sdr-a-novel-loss-function-for-separation-of-meeting-style-data.txt",
          "section_or_location": "3. AGGREGATING SDR ACROSS OUTPUTS",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.99,
          "statement": "On meeting-style data, the SA-SDR loss variant reports 19.8 BSSEval SDR and 16.1 SA-SDR, while SA-tSDR reaches 17.9 SA-SDR.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_sa-sdr-a-novel-loss-function-for-separation-of-meeting-style-data-sa-sdr-a-novel-loss-function-for-separation-of-meeting-style-data.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_sa-sdr-a-novel-loss-function-for-separation-of-meeting-style-data-sa-sdr-a-novel-loss-function-for-separation-of-meeting-style-data.txt",
          "section_or_location": "Table 2. Comparison of the separation performance of SDR variants on meeting-style data.",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_advances-and-challenges-in-deep-lip-reading",
      "slug": "advances-and-challenges-in-deep-lip-reading",
      "title": "Advances and Challenges in Deep Lip Reading",
      "year": 2021,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Marzieh Oghbaie",
        "Arian Sabaghi",
        "Kooshan Hashemifard",
        "Mohammad Akbari"
      ],
      "url": "https://nao-ki-mura.com/paper/advances-and-challenges-in-deep-lip-reading",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2110.07879",
      "arxiv_url": "https://arxiv.org/abs/2110.07879",
      "review_state": "expert_fulltext_draft",
      "review_priority": "low",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:face",
        "body_site:lip",
        "modality:video",
        "task:survey"
      ],
      "expert_take_short": "Good survey, not a model result.",
      "expert_take_long": "The paper is strongest as field organization. The introduction explicitly says the survey focuses on dataset obstacles, evaluation metrics, and impediments across the VSR pipeline. Section 3.1.2 reviews why in-the-wild datasets matter because controlled corpora do not transfer cleanly to real-world conditions. Section 3.4 then summarizes the metric families, including word accuracy, sentence accuracy, error-rate metrics, and BLEU. That makes it useful background for SSI-adjacent visual speech work, but it cannot be cited as evidence that any specific lip-reading or lip-to-speech system works.",
      "expert_true_value": "The full text supports using this paper as orientation, not as system evidence: it is a structured review of where lip reading was succeeding and where data and evaluation were still bottlenecks.",
      "canon_before": "The lip-reading literature was growing quickly, but its datasets, task variants, and evaluation practices were still fragmented.",
      "delta_from_canon": "This paper organizes the field into datasets, pipeline modules, data challenges, and evaluation metrics rather than proposing another model.",
      "position_in_field": "Background survey for visual speech recognition and SSI-adjacent lip-reading context.",
      "practical_value": "Useful for mapping datasets, metrics, and open problems before choosing a technical direction.",
      "axes_moved": "field synthesis; evaluation framing",
      "axes_unresolved": "survey coverage is bounded by the literature up to 2021",
      "axes_regressed": "",
      "technical_limits": "Survey article; it does not contribute a new model or experimental benchmark of its own.",
      "evaluation_limits": "All claims are literature synthesis rather than original experiments.",
      "deployment_limits": "No deployment path is evaluated because this is a review paper.",
      "scope_limits": "Deep lip-reading survey only.",
      "task": "survey",
      "input_modality": "video",
      "sensor_hardware": "",
      "body_site": "face; lip",
      "output_type": "",
      "vocabulary_type": "",
      "vocabulary_size": "",
      "metrics": "surveyed metrics include word accuracy, sentence accuracy rate, error-rate family metrics, and BLEU",
      "evaluation_mode": "literature survey over datasets, pipeline modules, challenges, and evaluation criteria in deep lip reading",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 0.98,
          "statement": "The abstract frames the paper as a comprehensive survey of deep-learning-based visual speech recognition focused on data challenges, task complications, and corresponding solutions.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_advances-and-challenges-in-deep-lip-reading-advances-and-challenges-in-deep-lip-reading.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_advances-and-challenges-in-deep-lip-reading-advances-and-challenges-in-deep-lip-reading.txt",
          "section_or_location": "A BSTRACT",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 0.97,
          "statement": "The introduction says the survey specifically reviews major datasets, metrics, sub-modules of the VSR pipeline, and open problems rather than proposing a new model.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_advances-and-challenges-in-deep-lip-reading-advances-and-challenges-in-deep-lip-reading.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_advances-and-challenges-in-deep-lip-reading-advances-and-challenges-in-deep-lip-reading.txt",
          "section_or_location": "1       Introduction",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 0.97,
          "statement": "Section 3.1.2 explains why in-the-wild datasets are necessary, noting that controlled datasets do not transfer cleanly to real-world conditions.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_advances-and-challenges-in-deep-lip-reading-advances-and-challenges-in-deep-lip-reading.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_advances-and-challenges-in-deep-lip-reading-advances-and-challenges-in-deep-lip-reading.txt",
          "section_or_location": "3.1.2   Lip Reading Datasets in the Wild",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "fact",
          "confidence": 0.97,
          "statement": "Section 3.4 reviews the field's evaluation metrics, including word accuracy, sentence accuracy, error-rate metrics, and BLEU.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_advances-and-challenges-in-deep-lip-reading-advances-and-challenges-in-deep-lip-reading.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_advances-and-challenges-in-deep-lip-reading-advances-and-challenges-in-deep-lip-reading.txt",
          "section_or_location": "3.4    Evaluation Criteria",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_sub-word-level-lip-reading-with-visual-attention",
      "slug": "sub-word-level-lip-reading-with-visual-attention",
      "title": "Sub-word Level Lip Reading With Visual Attention",
      "year": 2021,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "K R Prajwal",
        "Triantafyllos Afouras",
        "Andrew Zisserman"
      ],
      "url": "https://nao-ki-mura.com/paper/sub-word-level-lip-reading-with-visual-attention",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2110.07603",
      "arxiv_url": "https://arxiv.org/abs/2110.07603",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:face",
        "body_site:lip",
        "modality:video",
        "output:text",
        "task:speech-recognition",
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "deployment:hands-free",
        "evaluation:unseen-words"
      ],
      "expert_take_short": "Major lip-reading gain, adjacent to SSI.",
      "expert_take_long": "The paper is materially stronger than prior public-data lip reading systems. The final model reaches 28.9 WER on LRS2 using only public data, and 22.6 with additional data, while the ablation table shows the gains are not accidental: WordPiece decoding and visual transformer pooling each buy substantial error reduction. The scope caveat is straightforward. This is a camera-based VSR system, not a tongue, EMG, or ultrasound SSI device.",
      "expert_true_value": "This is a strong visual speech recognition paper with real benchmark gains, but it is camera-only lip reading rather than an articulatory SSI modality.",
      "canon_before": "Lip-reading systems often used character-level decoding and average pooling over face features, leaving performance and data efficiency on the table.",
      "delta_from_canon": "Introduces visual transformer pooling and WordPiece decoding, then reuses the encoder for visual speech detection.",
      "position_in_field": "Top-tier lip-reading paper adjacent to SSI.",
      "practical_value": "Useful when the problem is silent-video speech recognition and data-efficient visual speech detection, especially without audio.",
      "axes_moved": "model quality; data efficiency; transfer to VSD",
      "axes_unresolved": "in-the-wild silent dictation; occlusion robustness; privacy-sensitive deployment",
      "axes_regressed": "",
      "technical_limits": "The system still depends on face video quality and benchmark-style training corpora.",
      "evaluation_limits": "All results are offline benchmarks; no live silent dictation study is reported.",
      "deployment_limits": "No discussion of on-device latency, privacy, or in-the-wild robustness beyond benchmarks.",
      "scope_limits": "Camera-only visual speech recognition and detection.",
      "task": "lip reading",
      "input_modality": "silent video",
      "sensor_hardware": "camera",
      "body_site": "face; lip",
      "output_type": "text",
      "vocabulary_type": "WordPiece sub-word units",
      "vocabulary_size": "30522 tokens",
      "metrics": "Public-data training reaches 28.9 WER on LRS2 and 40.6 on LRS3; extended training reaches 22.6 and 30.7; WordPiece reduces LRS2 WER from 41.0 to 37.2 and VTP further to 30.9",
      "evaluation_mode": "LRS2 and LRS3 WER benchmarks with ablations, plus AVA ActiveSpeaker visual speech detection transfer",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 0.99,
          "statement": "The paper proposes attention-based pooling, sub-word units, and a visual speech detection model to improve lip reading from silent videos.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_sub-word-level-lip-reading-with-visual-attention-sub-word-level-lip-reading-with-visual-attention.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_sub-word-level-lip-reading-with-visual-attention-sub-word-level-lip-reading-with-visual-attention.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.99,
          "statement": "The best public-data model reports 28.9 WER on LRS2 and 40.6 on LRS3, while the larger-data version reaches 22.6 and 30.7.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_sub-word-level-lip-reading-with-visual-attention-sub-word-level-lip-reading-with-visual-attention.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_sub-word-level-lip-reading-with-visual-attention-sub-word-level-lip-reading-with-visual-attention.txt",
          "section_or_location": "Table 1. Comparison of different lip reading models on the test sets of the LRS2 and LRS3 datasets in terms of Word Error Rate % (WER,",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 0.98,
          "statement": "The ablation table shows WordPiece tokenization improves LRS2 WER from 41.0 to 37.2 and adding VTP further reduces it to 30.9, so both proposed changes do real work.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_sub-word-level-lip-reading-with-visual-attention-sub-word-level-lip-reading-with-visual-attention.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_sub-word-level-lip-reading-with-visual-attention-sub-word-level-lip-reading-with-visual-attention.txt",
          "section_or_location": "Table 2. Ablation on the design improvements proposed in this",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_speech-synthesis-from-text-and-ultrasound-tongue-image-based-articulatory-input",
      "slug": "speech-synthesis-from-text-and-ultrasound-tongue-image-based-articulatory-input",
      "title": "Speech Synthesis from Text and Ultrasound Tongue Image-based Articulatory Input",
      "year": 2021,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Tamas Gabor Csapo",
        "Laszlo Toth",
        "Gabor Gosztolya",
        "Alexandra Marko"
      ],
      "url": "https://nao-ki-mura.com/paper/speech-synthesis-from-text-and-ultrasound-tongue-image-based-articulatory-input",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2107.02003",
      "arxiv_url": "https://arxiv.org/abs/2107.02003",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium-high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:tongue",
        "modality:ultrasound",
        "output:speech-audio",
        "task:speech-reconstruction",
        "evaluation:quantitative",
        "deployment:hands-free",
        "deployment:wearable",
        "deployment:speaker-dependent"
      ],
      "expert_take_short": "Helpful side information, not standalone SSI.",
      "expert_take_long": "The paper earns its improvement claim. For every speaker in Table 1, the combined text-plus-ultrasound system beats text-only MCD by a small but consistent margin, which is exactly what you want from side information in a limited-data setup. But the same full text also states the system is not suitable for direct SSI because the best pipeline needs both text and articulatory input, and probe misalignment remains a serious failure mode.",
      "expert_true_value": "The full text supports a narrower claim than the title suggests: the best system is not a standalone SSI but a text-to-speech system helped by ultrasound side information under limited data.",
      "canon_before": "Articulatory-to-speech work usually used ultrasound alone, while DNN-TTS usually used text alone.",
      "delta_from_canon": "Combines conventional text-side linguistic features with ultrasound-derived articulatory features inside one DNN-TTS pipeline.",
      "position_in_field": "Adjacent articulatory-synthesis paper relevant to SSI but not itself a complete silent-speech interface.",
      "practical_value": "Useful when limited-data TTS can exploit synchronized articulatory recordings to improve spectral prediction.",
      "axes_moved": "system design; limited-data synthesis analysis",
      "axes_unresolved": "speaker independence; direct SSI without text; probe-robust capture",
      "axes_regressed": "",
      "technical_limits": "Speaker-dependent training, limited data, and sensitivity to probe misalignment constrain the result.",
      "evaluation_limits": "All evidence is speaker-dependent offline synthesis metrics without listening tests.",
      "deployment_limits": "The best system still requires text input and a probe-mounted headset.",
      "scope_limits": "Articulatory-augmented TTS, not pure silent-speech reconstruction.",
      "task": "speech synthesis from text plus ultrasound articulatory input",
      "input_modality": "text and ultrasound tongue images",
      "sensor_hardware": "Articulate Instruments Micro ultrasound system with probe-fixing headset",
      "body_site": "tongue",
      "output_type": "speech audio",
      "vocabulary_type": "",
      "vocabulary_size": "",
      "metrics": "Combined text plus ultrasound yields the lowest test MCD for all 8 speakers, for example 5.442 for 03mn and 5.236 for 06fe versus 5.652 and 5.447 for text-only; ultrasound-only remains far worse at 7.153 and 7.050",
      "evaluation_mode": "8-speaker dev/test evaluation with MCD, BAP, F0-RMSE, F0-CORR, F0-VUV and probe-misalignment analysis",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 0.99,
          "statement": "The study extends traditional vocoder-based DNN-TTS with articulatory input estimated from ultrasound tongue images and compares text-only, ultrasound-only, and combined inputs.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_speech-synthesis-from-text-and-ultrasound-tongue-image-based-articulatory-input-speech-synthesis-from-text-and-ultrasound-tongue-image-based-articulatory-input.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_speech-synthesis-from-text-and-ultrasound-tongue-image-based-articulatory-input-speech-synthesis-from-text-and-ultrasound-tongue-image-based-articulatory-input.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.99,
          "statement": "For every reported speaker in Table 1, the combined text-plus-ultrasound system has lower test MCD than text-only, for example 5.442 versus 5.652 for 03mn and 5.236 versus 5.447 for 06fe.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_speech-synthesis-from-text-and-ultrasound-tongue-image-based-articulatory-input-speech-synthesis-from-text-and-ultrasound-tongue-image-based-articulatory-input.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_speech-synthesis-from-text-and-ultrasound-tongue-image-based-articulatory-input-speech-synthesis-from-text-and-ultrasound-tongue-image-based-articulatory-input.txt",
          "section_or_location": "Table 1: MCD errors on the dev/test set.",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 0.98,
          "statement": "The transducer-position analysis shows several speakers have clearly separated test utterances in the misalignment matrix, and those speakers also show higher MCD errors.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_speech-synthesis-from-text-and-ultrasound-tongue-image-based-articulatory-input-speech-synthesis-from-text-and-ultrasound-tongue-image-based-articulatory-input.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_speech-synthesis-from-text-and-ultrasound-tongue-image-based-articulatory-input-speech-synthesis-from-text-and-ultrasound-tongue-image-based-articulatory-input.txt",
          "section_or_location": "4. Effect of ultrasound transducer position",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_sparsely-overlapped-speech-training-in-the-time-domain-joint-learning-of-target-speech-separation-and-personal-vad-benefits",
      "slug": "sparsely-overlapped-speech-training-in-the-time-domain-joint-learning-of-target-speech-separation-and-personal-vad-benefits",
      "title": "Sparsely Overlapped Speech Training in the Time Domain: Joint Learning of Target Speech Separation and Personal VAD Benefits",
      "year": 2021,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Qingjian Lin",
        "Lin Yang",
        "Xuyang Wang",
        "Luyuan Xie",
        "Chen Jia",
        "Junjie Wang"
      ],
      "url": "https://nao-ki-mura.com/paper/sparsely-overlapped-speech-training-in-the-time-domain-joint-learning-of-target-speech-separation-and-personal-vad-benefits",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2106.14371",
      "arxiv_url": "https://arxiv.org/abs/2106.14371",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "modality:acoustic",
        "output:speech-audio",
        "task:speech-enhancement",
        "evaluation:quantitative",
        "evaluation:structured-benchmark"
      ],
      "expert_take_short": "Useful separation engineering, not silent speech.",
      "expert_take_long": "The paper does real work on a real mismatch: most target separation models assume overlap all the time, but conversations are often sparse. Weighted SI-SNR and the personal VAD branch let the model exploit those sparse regions and gain 4.17 dB on clean SparseLibri2Mix. The price is scope: this remains acoustic target separation with speaker embeddings, not a silent-speech or articulatory interface system.",
      "expert_true_value": "The paper is a credible engineering step for target speech separation on realistic overlap patterns, but it is not an SSI contribution except by loose analogy to activity detection.",
      "canon_before": "Time-domain target separation systems usually train on fully overlapped mixtures and break when the target is absent because SI-SNR is undefined.",
      "delta_from_canon": "Treats sparse overlap and target absence as first-class training conditions via weighted SI-SNR and a personal VAD branch.",
      "position_in_field": "Speech-separation systems paper, outside SSI.",
      "practical_value": "Useful if the actual problem is speaker-conditioned separation under sparse overlap, especially when inference cost matters.",
      "axes_moved": "training objective; inference efficiency",
      "axes_unresolved": "noise robustness; wider domain generalization",
      "axes_regressed": "",
      "technical_limits": "Still standard acoustic source separation with target-speaker embeddings, not silent or articulatory input.",
      "evaluation_limits": "Benchmarks are offline and synthetic or semi-synthetic.",
      "deployment_limits": "No SSI hardware, user study, or silent communication loop exists.",
      "scope_limits": "Target speech separation on overlapped audio mixtures.",
      "task": "target speech separation with personal VAD",
      "input_modality": "speech audio",
      "sensor_hardware": "",
      "body_site": "",
      "output_type": "separated speech audio",
      "vocabulary_type": "",
      "vocabulary_size": "",
      "metrics": "Improves the baseline by 1.73 dB SDR on fully overlapped speech, 4.17 dB average SDR on clean sparse overlap, and 0.9 dB on noisy sparse overlap; early VAD branching reduces RTF from 0.61 to 0.47",
      "evaluation_mode": "fully overlapped VoiceFilter-style evaluation plus SparseLibri2Mix clean/noisy sparse-overlap SDRi, SI-SNRi, and real-time-factor studies",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 0.99,
          "statement": "The paper proposes weighted SI-SNR together with joint learning of target speech separation and personal VAD for sparsely overlapped speech.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_sparsely-overlapped-speech-training-in-the-time-domain-joint-learning-of-target-speech-separation-and-personal-vad-benefits-sparsely-overlapped-speech-training-in-the-time-domain-joint-learning-of-target-.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_sparsely-overlapped-speech-training-in-the-time-domain-joint-learning-of-target-speech-separation-and-personal-vad-benefits-sparsely-overlapped-speech-training-in-the-time-domain-joint-learning-of-target-.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.99,
          "statement": "On sparse clean speech, the proposed model outperforms the baseline by 4.17 dB SDR on average, with the largest gain coming from audios without overlap.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_sparsely-overlapped-speech-training-in-the-time-domain-joint-learning-of-target-speech-separation-and-personal-vad-benefits-sparsely-overlapped-speech-training-in-the-time-domain-joint-learning-of-target-.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_sparsely-overlapped-speech-training-in-the-time-domain-joint-learning-of-target-speech-separation-and-personal-vad-benefits-sparsely-overlapped-speech-training-in-the-time-domain-joint-learning-of-target-.txt",
          "section_or_location": "E. Results on Sparsely Overlapped Speech",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "deployment_claim",
          "confidence": 0.97,
          "statement": "Connecting the personal VAD branch earlier reduces the real-time factor from 0.61 to 0.47 at the cost of some SDR degradation.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_sparsely-overlapped-speech-training-in-the-time-domain-joint-learning-of-target-speech-separation-and-personal-vad-benefits-sparsely-overlapped-speech-training-in-the-time-domain-joint-learning-of-target-.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_sparsely-overlapped-speech-training-in-the-time-domain-joint-learning-of-target-speech-separation-and-personal-vad-benefits-sparsely-overlapped-speech-training-in-the-time-domain-joint-learning-of-target-.txt",
          "section_or_location": "F. Faster Inference",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_silent-speech-and-emotion-recognition-from-vocal-tract-shape-dynamics-in-real-time-mri",
      "slug": "silent-speech-and-emotion-recognition-from-vocal-tract-shape-dynamics-in-real-time-mri",
      "title": "Silent Speech and Emotion Recognition from Vocal Tract Shape Dynamics in Real-Time MRI",
      "year": 2021,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Laxmi Pandey",
        "Ahmed Sabbir Arif"
      ],
      "url": "https://nao-ki-mura.com/paper/silent-speech-and-emotion-recognition-from-vocal-tract-shape-dynamics-in-real-time-mri",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2106.08706",
      "arxiv_url": "https://arxiv.org/abs/2106.08706",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium-high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:lip",
        "body_site:oral-cavity",
        "body_site:palate",
        "body_site:throat",
        "body_site:tongue",
        "modality:magnetic",
        "output:text",
        "task:speech-recognition",
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "deployment:hands-free"
      ],
      "expert_take_short": "Strong rtMRI recognition result, weak deployment story.",
      "expert_take_long": "The paper earns its core claim. On USC-TIMIT it reaches 40.6% PER with the LM, which is materially better than the older rtMRI studies the authors cite. The second contribution is not filler: the emotion analysis shows systematic lower-boundary distortions and gender differences across vocal-tract subregions. But nothing here changes the fact that rtMRI is expensive, immobile, and unsuitable for day-to-day SSI deployment.",
      "expert_true_value": "The sentence-level rtMRI recognizer is real and clearly stronger than the cited earlier rtMRI baselines, but the modality remains a laboratory instrument rather than a deployable SSI path.",
      "canon_before": "rtMRI speech work mostly stayed at smaller recognition units or articulatory analysis rather than sentence-level transcription.",
      "delta_from_canon": "Pushes rtMRI to sentence-level text output and links recognition with a second analysis of emotion-dependent geometry.",
      "position_in_field": "Important articulatory recognition paper at the edge of SSI scope.",
      "practical_value": "Useful for understanding what full vocal-tract imaging can encode and for benchmarking articulatory recognition under rich but impractical sensing.",
      "axes_moved": "recognition; analysis",
      "axes_unresolved": "scaling; portability; broader vocabularies and speakers",
      "axes_regressed": "",
      "technical_limits": "The sensing hardware is large and immobile, and the data scope is narrow relative to practical SSI needs.",
      "evaluation_limits": "The recognition evidence is limited to USC-TIMIT and offline decoding.",
      "deployment_limits": "No deployable hardware path, latency study, or accessibility trial exists.",
      "scope_limits": "Laboratory rtMRI recognition and articulatory analysis.",
      "task": "sentence-level speech recognition from rtMRI",
      "input_modality": "real-time MRI video",
      "sensor_hardware": "real-time MRI scanner",
      "body_site": "lip; oral-cavity; palate; throat; tongue",
      "output_type": "text",
      "vocabulary_type": "sentence transcription",
      "vocabulary_size": "",
      "metrics": "40.6% PER on USC-TIMIT with language model; prior cited rtMRI baselines were 58% error on VCV recognition and 57% error on phoneme classification",
      "evaluation_mode": "USC-TIMIT recognition on unseen data plus emotion and gender articulation analysis on USC-EMO-MRI",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 0.99,
          "statement": "The framework translates variable-length rtMRI vocal-tract shape sequences into text using an end-to-end deep neural network.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_silent-speech-and-emotion-recognition-from-vocal-tract-shape-dynamics-in-real-time-mri-silent-speech-and-emotion-recognition-from-vocal-tract-shape-dynamics-in-real-ti.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_silent-speech-and-emotion-recognition-from-vocal-tract-shape-dynamics-in-real-time-mri-silent-speech-and-emotion-recognition-from-vocal-tract-shape-dynamics-in-real-ti.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.99,
          "statement": "The phrases-with-LM model reports 40.6 PER, 39.4 CER, and 42.1 WER on USC-TIMIT unseen data.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_silent-speech-and-emotion-recognition-from-vocal-tract-shape-dynamics-in-real-time-mri-silent-speech-and-emotion-recognition-from-vocal-tract-shape-dynamics-in-real-ti.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_silent-speech-and-emotion-recognition-from-vocal-tract-shape-dynamics-in-real-time-mri-silent-speech-and-emotion-recognition-from-vocal-tract-shape-dynamics-in-real-ti.txt",
          "section_or_location": "Table 2: Performance of the three examined speech recognition models exploiting vocal tract dynamics on unseen data.",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 0.97,
          "statement": "The discussion frames the approach as potentially useful but explicitly notes that the enabling technology for day-to-day use does not yet exist.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_silent-speech-and-emotion-recognition-from-vocal-tract-shape-dynamics-in-real-time-mri-silent-speech-and-emotion-recognition-from-vocal-tract-shape-dynamics-in-real-ti.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_silent-speech-and-emotion-recognition-from-vocal-tract-shape-dynamics-in-real-time-mri-silent-speech-and-emotion-recognition-from-vocal-tract-shape-dynamics-in-real-ti.txt",
          "section_or_location": "7   DISCUSSION",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_neural-speaker-embeddings-for-ultrasound-based-silent-speech-interfaces",
      "slug": "neural-speaker-embeddings-for-ultrasound-based-silent-speech-interfaces",
      "title": "Neural Speaker Embeddings for Ultrasound-based Silent Speech Interfaces",
      "year": 2021,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Amin Honarmandi Shandiz",
        "László Tóth",
        "Gábor Gosztolya",
        "Alexandra Markó",
        "Tamás Gábor Csapó"
      ],
      "url": "https://nao-ki-mura.com/paper/neural-speaker-embeddings-for-ultrasound-based-silent-speech-interfaces",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2106.04552",
      "arxiv_url": "https://arxiv.org/abs/2106.04552",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium-high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:tongue",
        "modality:ultrasound",
        "output:speech-audio",
        "task:speech-reconstruction",
        "evaluation:quantitative"
      ],
      "expert_take_short": "The ultrasound-based x-vector speaker embedding is highly effective for speaker recognition, achieving under 1% error on unseen speakers, but its integration yields only a marginal improvement in multi-speaker ultrasound-to-speech synthesis accuracy.",
      "expert_take_long": "This work presents a well-executed adaptation of the x-vector speaker embedding architecture to ultrasound tongue imaging for silent speech interfaces. The authors trained a 3D-CNN based x-vector network on 50 speakers from the TaL80 corpus and validated its speaker recognition ability on 31 held-out speakers, achieving extremely low error rates down to around 0.7% with a simple 1-NN classifier (Tables 1 and 2). The embedding vectors show appropriate clustering by speaker (Fig. 3), demonstrating strong speaker discriminability. However, when integrated as auxiliary input into a multi-speaker ultrasound-to-speech spectral estimator, the actual quantitative improvement in synthesis quality is marginal. The multi-speaker model's mean squared error in spectral estimation only improves slightly (from 0.669 to 0.653) with the addition of the x-vector embeddings (Table 3), a result consistent with prior reports. The system operates offline on the TaL80 dataset, and does not yet address critical practical issues such as cross-session robustness, probe placement variation, or fully speaker-independent synthesis, limiting immediate deployment readiness. Nonetheless, the paper provides a strong contribution in the area of speaker characterization from ultrasound video and highlights key challenges in extending this to multi-speaker SSI synthesis.",
      "expert_true_value": "The study firmly establishes that x-vector style embeddings can be trained successfully from ultrasound tongue video and generalize to unseen speakers, but reveals that naïve incorporation into SSI synthesis yields limited gains, indicating the need for more sophisticated integration methods and SSI modeling for speaker-independent multi-speaker operation.",
      "canon_before": "Ultrasound-based silent speech interfaces (SSI) have traditionally been speaker-dependent due to speaker-specific anatomical differences, and speaker conditioning usually relied on simpler speaker descriptors.",
      "delta_from_canon": "Trains a dedicated ultrasound-based x-vector network for speaker embedding and injects it as an auxiliary input to a multi-speaker ultrasound-to-speech spectral estimation network.",
      "position_in_field": "Speaker-representation study adjacent to multi-speaker ultrasound SSI research.",
      "practical_value": "Strong speaker embedding for ultrasound data; marginal evidence for improved multi-speaker SSI synthesis so far.",
      "axes_moved": "speaker representation; multi-speaker modeling",
      "axes_unresolved": "optimal integration of speaker embeddings into SSI synthesis; true speaker-independent multi-speaker synthesis; robustness to session variability and probe misalignment",
      "axes_regressed": "",
      "technical_limits": "Marginal multi-speaker synthesis gain likely due to suboptimal method of injecting speaker embeddings after convolutional layers; no robustness to session or probe changes tested.",
      "evaluation_limits": "Multi-speaker synthesis experiments are speaker-dependent with train, dev, and test sets drawn from the same 31-speaker subset; no fully speaker-independent SSI evaluation presented.",
      "deployment_limits": "No evidence of live deployment or robustness to session and probe shifts; no cross-session or probe-position variability handling demonstrated.",
      "scope_limits": "Ultrasound speaker representation with limited downstream multi-speaker speech synthesis evaluation on a controlled corpus.",
      "task": "speech-reconstruction",
      "input_modality": "ultrasound",
      "sensor_hardware": "Articulate Instruments’ Micro ultrasound system capturing 64x842 pixel mid-sagittal tongue images at 82 fps, resized to 64x128 for processing.",
      "body_site": "tongue",
      "output_type": "speech-audio",
      "vocabulary_type": "",
      "vocabulary_size": "",
      "metrics": "Speaker recognition error rates (%); Mean squared error (MSE) in spectral estimation; Mel-cepstral distortion (MCD) derived from MSE was 3.12 for single speaker synthesis.",
      "evaluation_mode": "Speaker recognition on held-out speakers and ultrasound-to-spectrum speech synthesis error evaluation in single and multi-speaker settings.",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 1,
          "statement": "We adjusted the x-vector framework popular in speech processing to operate with ultrasound tongue videos, producing speaker embedding vectors which we tested in multi-speaker ultrasound-to-speech conversion.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_neural-speaker-embeddings-for-ultrasound-based-silent-speech-interfaces-neural-speaker-embeddings-for-ultrasound-based-silent-speech-interfaces.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_neural-speaker-embeddings-for-ultrasound-based-silent-speech-interfaces-neural-speaker-embeddings-for-ultrasound-based-silent-speech-interfaces.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "Speaker recognition error rates on development set for the 50-speaker training subset decreased to 1.96% with 164-frame (2 sec) segments (Table 1).",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_neural-speaker-embeddings-for-ultrasound-based-silent-speech-interfaces-neural-speaker-embeddings-for-ultrasound-based-silent-speech-interfaces.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_neural-speaker-embeddings-for-ultrasound-based-silent-speech-interfaces-neural-speaker-embeddings-for-ultrasound-based-silent-speech-interfaces.txt",
          "section_or_location": "5. Results",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "Speaker recognition error rates for the held-out 31 speakers using 1-nearest neighbor leave-one-out testing are as low as 0.70% when embeddings are extracted from FC#2 without nonlinearity (Table 2).",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_neural-speaker-embeddings-for-ultrasound-based-silent-speech-interfaces-neural-speaker-embeddings-for-ultrasound-based-silent-speech-interfaces.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_neural-speaker-embeddings-for-ultrasound-based-silent-speech-interfaces-neural-speaker-embeddings-for-ultrasound-based-silent-speech-interfaces.txt",
          "section_or_location": "5. Results",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "fact",
          "confidence": 1,
          "statement": "The mean squared error (MSE) for the ultrasound-to-spectrum spectral estimation task in the single-speaker setup is about 0.265 on the test set; in a multi-speaker setup without x-vector conditioning it rises to 0.669; adding x-vector conditioning marginally improves MSE to 0.653 (Table 3).",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_neural-speaker-embeddings-for-ultrasound-based-silent-speech-interfaces-neural-speaker-embeddings-for-ultrasound-based-silent-speech-interfaces.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_neural-speaker-embeddings-for-ultrasound-based-silent-speech-interfaces-neural-speaker-embeddings-for-ultrasound-based-silent-speech-interfaces.txt",
          "section_or_location": "5. Results",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 1,
          "statement": "The multi-speaker synthesis experiment is speaker-dependent as the training, development, and test sets come from the same 31 speakers, so full speaker-independence is not demonstrated; probe-shift and cross-session variation handling are not studied.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_neural-speaker-embeddings-for-ultrasound-based-silent-speech-interfaces-neural-speaker-embeddings-for-ultrasound-based-silent-speech-interfaces.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_neural-speaker-embeddings-for-ultrasound-based-silent-speech-interfaces-neural-speaker-embeddings-for-ultrasound-based-silent-speech-interfaces.txt",
          "section_or_location": "5. Results",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 1,
          "statement": "The method used to integrate the x-vector speaker embeddings into the SSI spectral estimation network is suboptimal, as simple concatenation is not possible and embeddings were injected only after convolutional layers, limiting gain and requiring further study.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_neural-speaker-embeddings-for-ultrasound-based-silent-speech-interfaces-neural-speaker-embeddings-for-ultrasound-based-silent-speech-interfaces.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_neural-speaker-embeddings-for-ultrasound-based-silent-speech-interfaces-neural-speaker-embeddings-for-ultrasound-based-silent-speech-interfaces.txt",
          "section_or_location": "5. Results",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "deployment_claim",
          "confidence": 1,
          "statement": "No probe-shift, session-variation, or live SSI deployment study is provided, and results remain offline and lab-bound, limiting deployment readiness.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_neural-speaker-embeddings-for-ultrasound-based-silent-speech-interfaces-neural-speaker-embeddings-for-ultrasound-based-silent-speech-interfaces.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_neural-speaker-embeddings-for-ultrasound-based-silent-speech-interfaces-neural-speaker-embeddings-for-ultrasound-based-silent-speech-interfaces.txt",
          "section_or_location": "6. Conclusions",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_an-improved-model-for-voicing-silent-speech",
      "slug": "an-improved-model-for-voicing-silent-speech",
      "title": "An Improved Model for Voicing Silent Speech",
      "year": 2021,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "David Gaddy",
        "Dan Klein"
      ],
      "url": "https://nao-ki-mura.com/paper/an-improved-model-for-voicing-silent-speech",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2106.01933",
      "arxiv_url": "https://arxiv.org/abs/2106.01933",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:face",
        "modality:emg",
        "output:speech-audio",
        "task:speech-reconstruction",
        "deployment:speaker-dependent",
        "evaluation:quantitative",
        "evaluation:unseen-words"
      ],
      "expert_take_short": "This paper substantially improves open-vocabulary silent speech voicing using learned convolutional EMG features, Transformer modeling, and phoneme supervision, reducing WER from 68.0% to 42.2% automatic and 32.3% human in a single-speaker lab setting.",
      "expert_take_long": "This paper presents a noteworthy advancement in open-vocabulary silent speech voicing using facial EMG from a single speaker. By shifting from hand-designed EMG features to learned convolutional features, introducing a Transformer architecture for improved temporal context, and adding an auxiliary phoneme prediction loss, the authors achieve a substantial 25.8% absolute reduction in WER—from 68.0% to 42.2% automatic and further to 32.3% in human transcription. Ablation experiments convincingly show that each component contributes to performance gains. The phoneme confusion and articulatory feature analyses provide valuable insights into model errors, showing persistent challenges with voicing and nasality distinctions, consistent with prior findings. Despite these gains, the work's scope is currently limited to a single speaker and session-dependent setting without evaluation of speaker independence, multi-session robustness, or practical deployment considerations such as wearable stability or mobile use. Thus, while this represents a strong speaker-dependent EMG silent-speech reconstruction baseline and advances evaluation practices, substantial work remains to achieve robust, generalizable, and deployable silent speech prosthetics.",
      "expert_true_value": "Provides a strong step forward in speaker-dependent facial EMG silent speech reconstruction, demonstrating the value of learned features, Transformer architectures, and auxiliary phoneme prediction loss to substantially lower WER and improve intelligibility over prior work.",
      "canon_before": "Prior work used hand-crafted facial EMG features with recurrent LSTM-based models achieving 68.0% WER in open-vocabulary silent speech voicing.",
      "delta_from_canon": "Replaces hand-designed features with learned convolutional features, swaps LSTM with Transformer layers, and adds auxiliary phoneme prediction loss during training.",
      "position_in_field": "Strong speaker-dependent facial EMG silent speech reconstruction paper demonstrating large intelligibility improvements in open-vocabulary synthesis.",
      "practical_value": "Serves as a strong open-vocabulary facial EMG voicing baseline and demonstrates the efficacy of learned EMG feature extraction and auxiliary phoneme supervision; useful for single-speaker experimental research but not yet practical deployment.",
      "axes_moved": "Model quality and open-vocabulary intelligibility improved significantly; also adds detailed phoneme-level error analysis.",
      "axes_unresolved": "Speaker independence and session robustness remain unaddressed; wearable practicality and long-term use not evaluated.",
      "axes_regressed": "",
      "technical_limits": "Model trained on single speaker without addressing speaker independence or robustness across sessions; no wearable or real-world mobility evaluation; phoneme distinctions like voicing remain challenging.",
      "evaluation_limits": "Human evaluation on only 40 silent speech samples with two raters; no session-independent or cross-speaker evaluation; automatic WER metric validated but still limited.",
      "deployment_limits": "Single-speaker lab data only; no tests on wearable electrodes, mobility, or long-term recalibration; no cross-session or cross-speaker evaluation.",
      "scope_limits": "Single speaker, facial EMG signals, open-vocabulary silent speech voicing only.",
      "task": "speech-reconstruction",
      "input_modality": "emg",
      "sensor_hardware": "",
      "body_site": "face",
      "output_type": "speech-audio",
      "vocabulary_type": "",
      "vocabulary_size": "",
      "metrics": "Primary metric is word error rate (WER) evaluated both automatically and by human transcription; ablations report WER values of 45.2%, 46.0%, and 51.7% when specific components are removed.",
      "evaluation_mode": "Automatic and human transcription evaluations using word error rate (WER) on open-vocabulary silent speech synthesis from facial EMG.",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 1,
          "statement": "Improves silent-speech voicing by replacing hand-designed EMG features with learned convolutional features, using a Transformer, and adding a phoneme auxiliary loss.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_an-improved-model-for-voicing-silent-speech-an-improved-model-for-voicing-silent-speech.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_an-improved-model-for-voicing-silent-speech-an-improved-model-for-voicing-silent-speech.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "Our model improves WER absolute by 25.8% from prior 68.0% to 42.2% as measured by automatic transcription, with human transcription averaging 32.3% WER on held-out samples.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_an-improved-model-for-voicing-silent-speech-an-improved-model-for-voicing-silent-speech.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_an-improved-model-for-voicing-silent-speech-an-improved-model-for-voicing-silent-speech.txt",
          "section_or_location": "3 Results",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 1,
          "statement": "The gain comes from replacing hand-crafted EMG features by convolutional feature extraction, employing a Transformer encoder on top, and adding an auxiliary phoneme prediction loss to improve learning.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_an-improved-model-for-voicing-silent-speech-an-improved-model-for-voicing-silent-speech.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_an-improved-model-for-voicing-silent-speech-an-improved-model-for-voicing-silent-speech.txt",
          "section_or_location": "2 Model",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 1,
          "statement": "All data is from a single English speaker with 19 hours of facial EMG recordings in silent and vocalized speech; evaluation includes open-vocabulary automatic WER and human transcription on 40 samples by two raters; no cross-speaker or multi-session tests reported.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_an-improved-model-for-voicing-silent-speech-an-improved-model-for-voicing-silent-speech.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_an-improved-model-for-voicing-silent-speech-an-improved-model-for-voicing-silent-speech.txt",
          "section_or_location": "3 Results",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 1,
          "statement": "The system is limited to a single speaker, no session or speaker independence evaluation, and lacks tests for wearable robustness, mobility, or long-term recalibration.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_an-improved-model-for-voicing-silent-speech-an-improved-model-for-voicing-silent-speech.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_an-improved-model-for-voicing-silent-speech-an-improved-model-for-voicing-silent-speech.txt",
          "section_or_location": "5 Conclusion",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_voice-activity-detection-for-ultrasound-based-silent-speech-interfaces-using-convolutional-neural-networks",
      "slug": "voice-activity-detection-for-ultrasound-based-silent-speech-interfaces-using-convolutional-neural-networks",
      "title": "Voice Activity Detection for Ultrasound-based Silent Speech Interfaces using Convolutional Neural Networks",
      "year": 2021,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Amin Honarmandi Shandiz",
        "Laszlo Toth"
      ],
      "url": "https://nao-ki-mura.com/paper/voice-activity-detection-for-ultrasound-based-silent-speech-interfaces-using-convolutional-neural-networks",
      "doi": "10.1007/978-3-030-83527-9_43",
      "doi_url": "https://doi.org/10.1007/978-3-030-83527-9_43",
      "arxiv_id": "2105.13718",
      "arxiv_url": "https://arxiv.org/abs/2105.13718",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium-high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:tongue",
        "modality:ultrasound",
        "output:labels",
        "task:speech-enhancement",
        "evaluation:quantitative",
        "deployment:hands-free",
        "deployment:speaker-dependent"
      ],
      "expert_take_short": "Preprocessing paper, narrow but legitimate.",
      "expert_take_long": "The paper does not solve ultrasound SSI, but it does close a real preprocessing gap. The classifier reaches 85.2% test accuracy with 0.859 ROC AUC on the ultrasound speech/silence task, and the downstream experiment shows that keeping long silence can worsen MCD. The scope remains narrow because everything is single-speaker TaL1 and the gains are incremental rather than transformative.",
      "expert_true_value": "The result is modest but real: ultrasound-based speech-versus-silence detection works reasonably well on one speaker, and silence trimming slightly helps downstream SSI reconstruction.",
      "canon_before": "Ultrasound SSI systems usually assumed speech frames or used speech-side VAD labels without testing ultrasound-only VAD itself.",
      "delta_from_canon": "Adds an explicit ultrasound VAD stage and checks how silence removal affects articulatory-to-acoustic synthesis metrics.",
      "position_in_field": "Core ultrasound SSI preprocessing paper.",
      "practical_value": "Useful if an ultrasound SSI pipeline needs a simple front-end to drop silence before synthesis.",
      "axes_moved": "preprocessing; evaluation",
      "axes_unresolved": "cross-speaker robustness; real-time deployment; broader corpora",
      "axes_regressed": "",
      "technical_limits": "Single-speaker data and speech-derived labels limit the result.",
      "evaluation_limits": "No cross-speaker validation or live interactive test is provided.",
      "deployment_limits": "Practical robustness to probe shift, silence styles, and new users is unknown.",
      "scope_limits": "Speech-silence preprocessing for one ultrasound SSI setup.",
      "task": "speech/silence detection for ultrasound-based SSI",
      "input_modality": "ultrasound tongue images",
      "sensor_hardware": "ultrasound imaging system",
      "body_site": "tongue",
      "output_type": "speech/silence labels",
      "vocabulary_type": "",
      "vocabulary_size": "2 classes",
      "metrics": "Ultrasound VAD reaches 85.2% test accuracy, F1 0.9, and ROC AUC 0.859; with Conv3D+BiLSTM SSI, removing silence yields test MCD 3.05 versus 3.12 when keeping 180 ms silence",
      "evaluation_mode": "single-speaker TaL1 classification accuracy/AUC plus SSI reconstruction with and without silence removal",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 0.99,
          "statement": "The paper first shows silence handling affects SSI quality, then trains a CNN to separate speech and silent ultrasound tongue images.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_voice-activity-detection-for-ultrasound-based-silent-speech-interfaces-using-convolutional-neural-networks-voice-activity-detection-for-ultrasound-based-silent-speech-interfaces-using-con.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_voice-activity-detection-for-ultrasound-based-silent-speech-interfaces-using-convolutional-neural-networks-voice-activity-detection-for-ultrasound-based-silent-speech-interfaces-using-con.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.99,
          "statement": "The ultrasound speech-silence classifier reaches 0.852 test accuracy, 0.9 F1, and 0.859 ROC AUC.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_voice-activity-detection-for-ultrasound-based-silent-speech-interfaces-using-convolutional-neural-networks-voice-activity-detection-for-ultrasound-based-silent-speech-interfaces-using-con.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_voice-activity-detection-for-ultrasound-based-silent-speech-interfaces-using-convolutional-neural-networks-voice-activity-detection-for-ultrasound-based-silent-speech-interfaces-using-con.txt",
          "section_or_location": "Table 5.",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.98,
          "statement": "With ultrasound-based VAD and the Conv3D+BiLSTM SSI model, removing silence gives test MCD 3.05 versus 3.12 when keeping 180 ms silence.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_voice-activity-detection-for-ultrasound-based-silent-speech-interfaces-using-convolutional-neural-networks-voice-activity-detection-for-ultrasound-based-silent-speech-interfaces-using-con.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_voice-activity-detection-for-ultrasound-based-silent-speech-interfaces-using-convolutional-neural-networks-voice-activity-detection-for-ultrasound-based-silent-speech-interfaces-using-con.txt",
          "section_or_location": "Table 7. Training the SSI system with removing or retaining silence from the data",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_speaker-disentanglement-in-video-to-speech-conversion",
      "slug": "speaker-disentanglement-in-video-to-speech-conversion",
      "title": "Speaker disentanglement in video-to-speech conversion",
      "year": 2021,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Dan Oneata",
        "Adriana Stan",
        "Horia Cucu"
      ],
      "url": "https://nao-ki-mura.com/paper/speaker-disentanglement-in-video-to-speech-conversion",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2105.09652",
      "arxiv_url": "https://arxiv.org/abs/2105.09652",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:face",
        "body_site:lip",
        "deployment:hands-free",
        "deployment:speaker-independent",
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "modality:video",
        "output:speech-audio",
        "task:speech-reconstruction"
      ],
      "expert_take_short": "The paper effectively makes speaker identity a controllable factor in multi-speaker video-to-speech synthesis by disentangling it from content, showing the trade-off between intelligibility and voice control on GRID corpus data.",
      "expert_take_long": "This work significantly advances video-to-speech synthesis by introducing explicit speaker identity conditioning through dedicated speaker inputs and adversarial disentanglement losses that remove speaker information from the visual front-end. Leveraging a strong ResNet+Tacotron2 baseline, it enables synthesis in multiple voices, including unseen speakers. The extensive evaluation on the GRID dataset demonstrates that the methods maintain or improve intelligibility while providing voice control. However, there remains a notable trade-off between intelligibility and speaker control for unseen speakers, and the method’s applicability is constrained by the fixed vocabulary and controlled recording conditions of GRID. Nevertheless, the study offers a valuable foundation for controllable lip-to-speech models and outlines important challenges for future deployments in real-world, spontaneous, or open-vocabulary settings.",
      "expert_true_value": "Provides an explicit mechanism to separate speaker identity from visual speech content enabling flexible multi-speaker voice control in video-to-speech conversion, overcoming limitations of previous single-speaker or entangled models.",
      "canon_before": "Prior video-to-speech methods assumed a single speaker or implicitly entangled speaker identity with content in visual features without explicit speaker control.",
      "delta_from_canon": "Reconceptualizes multi-speaker video-to-speech as a controllable speaker disentanglement task with explicit auxiliary speaker inputs and adversarial disentanglement losses.",
      "position_in_field": "A key reference in multi-speaker controllable video-to-speech synthesis demonstrating explicit speaker disentanglement mechanisms.",
      "practical_value": "Enables personalized and multi-speaker lip-to-speech synthesis with explicit voice control, promising for assistive technologies requiring speaker adaptation.",
      "axes_moved": "system_design; evaluation",
      "axes_unresolved": "Trade-offs remain between intelligibility and speaker control on unseen speakers, with improved disentanglement often decreasing intelligibility.",
      "axes_regressed": "",
      "technical_limits": "Fixed vocabulary; trade-off between intelligibility and speaker control, especially on unseen speakers; no demonstration of open-vocabulary or spontaneous speech generalization.",
      "evaluation_limits": "Evaluated only on closed-vocabulary GRID corpus; intelligibility and speaker control evaluated primarily on synthetic unseen speaker setups without spontaneous or open-vocabulary speech tests.",
      "deployment_limits": "Limited to GRID dataset’s fixed vocabulary and read speech; performance degrades on unseen speakers; requires speaker identity or embedding at inference, limiting zero-shot naturalness and real-world open-vocabulary scenarios.",
      "scope_limits": "Only closed-vocabulary GRID dataset with read speech; unseen speaker testing relies on synthetic pairing, no real-world open-vocabulary evaluation.",
      "task": "video-to-speech synthesis with speaker control",
      "input_modality": "video (silent lip region) plus explicit speaker identity or speaker embedding",
      "sensor_hardware": "camera",
      "body_site": "face; lip",
      "output_type": "speech-audio",
      "vocabulary_type": "GRID fixed sentence grammar",
      "vocabulary_size": "52 words",
      "metrics": "On unseen speaker identity control, best results include WER around 38.9% with EER about 11.9% using gradient reversal model; speaker-independent linear model achieves WER 42.7% and EER 7.3%.",
      "evaluation_mode": "Objective metrics including WER, EER, STOI, PESQ, MCD; additional listening tests for intelligibility and speaker similarity; speaker embeddings for similarity evaluation.",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 1,
          "statement": "We introduce a new video-to-speech architecture and extend it to multi-speaker scenario by augmenting the network with an additional speaker-related input feeding either a discrete identity or a speaker embedding, enabling control of the target voice and synthesis for unseen identities.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_speaker-disentanglement-in-video-to-speech-conversion-speaker-disentanglement-in-video-to-speech-conversion.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_speaker-disentanglement-in-video-to-speech-conversion-speaker-disentanglement-in-video-to-speech-conversion.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 1,
          "statement": "The model uses adversarial losses that dispel the identity from the video embeddings to better disentangle linguistic content and speaker identity.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_speaker-disentanglement-in-video-to-speech-conversion-speaker-disentanglement-in-video-to-speech-conversion.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_speaker-disentanglement-in-video-to-speech-conversion-speaker-disentanglement-in-video-to-speech-conversion.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 1,
          "statement": "They augment a visual processing front-end (3D conv, ResNet, LSTM) and Tacotron2 decoder with a speaker embedding component injected into the decoder and a speaker classifier applied on visual features to remove speaker information and disentangle content and identity.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_speaker-disentanglement-in-video-to-speech-conversion-speaker-disentanglement-in-video-to-speech-conversion.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_speaker-disentanglement-in-video-to-speech-conversion-speaker-disentanglement-in-video-to-speech-conversion.txt",
          "section_or_location": "III. METHOD DESCRIPTION",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 1,
          "statement": "Experiments are conducted on the GRID corpus dataset with 34,000 samples from 34 speakers and a 52-word fixed vocabulary with fixed sentence grammar.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_speaker-disentanglement-in-video-to-speech-conversion-speaker-disentanglement-in-video-to-speech-conversion.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_speaker-disentanglement-in-video-to-speech-conversion-speaker-disentanglement-in-video-to-speech-conversion.txt",
          "section_or_location": "IV. EXPERIMENTAL RESULTS",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "On unseen speaker control, the best speaker identity normalized rev-grad model achieves WER 38.9% and EER 11.9%; a speaker independent linear rev-grad model obtains WER 42.7% and EER 7.3%.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_speaker-disentanglement-in-video-to-speech-conversion-speaker-disentanglement-in-video-to-speech-conversion.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_speaker-disentanglement-in-video-to-speech-conversion-speaker-disentanglement-in-video-to-speech-conversion.txt",
          "section_or_location": "IV. EXPERIMENTAL RESULTS",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "For speaker-dependent evaluation on 4 speakers, models achieve WER as low as 17.8% (speaker-dependent baseline) and STOI around 0.468, PESQ around 1.85, and MCD around 32 decibels.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_speaker-disentanglement-in-video-to-speech-conversion-speaker-disentanglement-in-video-to-speech-conversion.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_speaker-disentanglement-in-video-to-speech-conversion-speaker-disentanglement-in-video-to-speech-conversion.txt",
          "section_or_location": "IV. EXPERIMENTAL RESULTS",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 1,
          "statement": "The methods are limited to the GRID closed vocabulary and read speech with evaluation on synthetic unseen speaker setups; no results on open vocabulary or spontaneous speech are provided.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_speaker-disentanglement-in-video-to-speech-conversion-speaker-disentanglement-in-video-to-speech-conversion.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_speaker-disentanglement-in-video-to-speech-conversion-speaker-disentanglement-in-video-to-speech-conversion.txt",
          "section_or_location": "V. CONCLUSIONS",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 1,
          "statement": "There is an observed trade-off between content intelligibility and speaker control, especially for unseen speakers, with improvements in disentanglement sometimes degrading intelligibility.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_speaker-disentanglement-in-video-to-speech-conversion-speaker-disentanglement-in-video-to-speech-conversion.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_speaker-disentanglement-in-video-to-speech-conversion-speaker-disentanglement-in-video-to-speech-conversion.txt",
          "section_or_location": "V. CONCLUSIONS",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_improving-neural-silent-speech-interface-models-by-adversarial-training",
      "slug": "improving-neural-silent-speech-interface-models-by-adversarial-training",
      "title": "Improving Neural Silent Speech Interface Models by Adversarial Training",
      "year": 2021,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Amin Honarmandi Shandiz",
        "László Tóth",
        "Gábor Gosztolya",
        "Alexandra Markó",
        "Tamás Gábor Csapó"
      ],
      "url": "https://nao-ki-mura.com/paper/improving-neural-silent-speech-interface-models-by-adversarial-training",
      "doi": "10.1007/978-3-030-76346-6_39",
      "doi_url": "https://doi.org/10.1007/978-3-030-76346-6_39",
      "arxiv_id": "2104.11601",
      "arxiv_url": "https://arxiv.org/abs/2104.11601",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium-high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:tongue",
        "modality:ultrasound",
        "output:speech-audio",
        "task:speech-reconstruction",
        "deployment:speaker-dependent",
        "evaluation:quantitative"
      ],
      "expert_take_short": "A clean, well-executed incremental advance using GAN loss to modestly improve articulatory-to-acoustic mapping from ultrasound, validated objectively on two single-speaker corpora.",
      "expert_take_long": "This paper demonstrates an incremental but methodologically sound improvement to ultrasound tongue imaging-based neural silent speech interfaces by adding adversarial training with a Patch-GAN discriminator to a 3D CNN generator. Tested on two single-speaker corpora (Hungarian and English), the combined MSE and adversarial loss yields consistent albeit small gains across multiple objective metrics (STOI, PESQ, MCD, SI-SDR etc.). Although there is no subjective listening test, the rigorous objective evaluation supports the claim that adversarial loss serves as a useful perceptual quality proxy improving articulatory-to-acoustic mappings. The contribution is primarily an improved training objective rather than novel model architecture or multi-speaker evaluation. Deployment is limited by speaker dependency and lack of real-time or robustness analysis.",
      "expert_true_value": "Demonstrates that combining adversarial loss with MSE training of a 3D CNN generator improves objective speech quality metrics in ultrasound-to-speech mapping, providing a justified but modest quality improvement without architectural novelty.",
      "canon_before": "Ultrasound-to-speech systems typically train models using MSE loss, which does not align well with perceptual speech quality; 3D CNN architectures were previously effective generators without adversarial losses.",
      "delta_from_canon": "Adds a Patch-GAN discriminator and adversarial loss combined with MSE to improve quality without changing the generator architecture, trained on synchronized ultrasound and speech spectral data.",
      "position_in_field": "Training-objective refinement for speaker-dependent ultrasound-based silent speech interfaces.",
      "practical_value": "Useful for existing 3D CNN ultrasound-to-speech systems seeking small improvements in objective quality metrics, but not immediately deployable for practical applications.",
      "axes_moved": "Training objective - introducing adversarial loss beyond conventional MSE regression; Evaluation metric consistency across multiple speech quality and intelligibility metrics.",
      "axes_unresolved": "Larger perceptual gains beyond incremental improvements; multi-speaker and real-world robustness; subjective listening test validation.",
      "axes_regressed": "",
      "technical_limits": "Limited training data per speaker; only single-speaker corpora; modest objective metric improvements; no subjective perceptual validation.",
      "evaluation_limits": "No subjective listening tests; limited to single-speaker datasets; only objective speech quality and intelligibility metrics reported.",
      "deployment_limits": "Speaker-dependent setup; no evidence of real-time operation or multi-speaker robustness; offline evaluation only.",
      "scope_limits": "Focus on speaker-dependent ultrasound to mel-spectrogram articulatory-to-speech reconstruction; no multi-speaker or real-time analysis.",
      "task": "speech-reconstruction",
      "input_modality": "ultrasound",
      "sensor_hardware": "Ultrasound probe (Micro system by Articulate Instruments Ltd.) positioned under the chin for tongue imaging in midsagittal plane.",
      "body_site": "tongue",
      "output_type": "speech-audio",
      "vocabulary_type": "",
      "vocabulary_size": "",
      "metrics": "Mean squared error (MSE), mean R2 score, Short-Time Objective Intelligibility (STOI), extended STOI (ESTOI), Perceptual Evaluation of Speech Quality (PESQ), Scale-Invariant Signal-to-Distortion Ratio (SI-SDR), Signal-to-Distortion Ratio (SDR), Perceptual Metric for Speech Quality Evaluation (PMSQE), Mel-Cepstral Distortion (MCD).",
      "evaluation_mode": "Objective quality metrics comparing MSE vs GAN training on held-out test data for Hungarian and English corpora.",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 1,
          "statement": "The paper applies GAN-style adversarial training to ultrasound-based articulatory-to-acoustic mapping and compares it against pure MSE training for silent speech interface models.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_improving-neural-silent-speech-interface-models-by-adversarial-training-improving-neural-silent-speech-interface-models-by-adversarial-training.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_improving-neural-silent-speech-interface-models-by-adversarial-training-improving-neural-silent-speech-interface-models-by-adversarial-training.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 1,
          "statement": "We extend the conventional MSE training loss with an adversarial loss component provided by a Patch-GAN discriminator network to improve perceptual speech quality in articulatory-to-acoustic mapping from ultrasound data.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_improving-neural-silent-speech-interface-models-by-adversarial-training-improving-neural-silent-speech-interface-models-by-adversarial-training.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_improving-neural-silent-speech-interface-models-by-adversarial-training-improving-neural-silent-speech-interface-models-by-adversarial-training.txt",
          "section_or_location": "3 Generative Adversarial Networks for Articulatory-to-Acoustic Mapping",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "Evaluation metrics include MSE, R2, STOI, ESTOI, PESQ, SI-SDR, SDR, PMSQE, and MCD, demonstrating consistent albeit slight improvements in all metrics for both Hungarian and English corpora when using GAN training versus MSE-only training.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_improving-neural-silent-speech-interface-models-by-adversarial-training-improving-neural-silent-speech-interface-models-by-adversarial-training.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_improving-neural-silent-speech-interface-models-by-adversarial-training-improving-neural-silent-speech-interface-models-by-adversarial-training.txt",
          "section_or_location": "5 Results",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 1,
          "statement": "Experiments used two single-speaker corpora: a Hungarian female speaker with 438 sentences and the TAL1 English subset with 1015 train, 50 dev, and 24 test utterances, focusing on speaker-dependent ultrasound-to-speech reconstruction.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_improving-neural-silent-speech-interface-models-by-adversarial-training-improving-neural-silent-speech-interface-models-by-adversarial-training.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_improving-neural-silent-speech-interface-models-by-adversarial-training-improving-neural-silent-speech-interface-models-by-adversarial-training.txt",
          "section_or_location": "4 Experimental Set-Up",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 1,
          "statement": "The system is limited by the lack of listening tests, multi-speaker evaluation, real-time operation evidence, speaker-independence, and practical deployment readiness.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_improving-neural-silent-speech-interface-models-by-adversarial-training-improving-neural-silent-speech-interface-models-by-adversarial-training.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_improving-neural-silent-speech-interface-models-by-adversarial-training-improving-neural-silent-speech-interface-models-by-adversarial-training.txt",
          "section_or_location": "5 Results",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "deployment_claim",
          "confidence": 1,
          "statement": "The training framework operates offline on lab-recorded single-speaker ultrasound data with no real-time demonstration or multi-speaker generalization.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_improving-neural-silent-speech-interface-models-by-adversarial-training-improving-neural-silent-speech-interface-models-by-adversarial-training.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_improving-neural-silent-speech-interface-models-by-adversarial-training-improving-neural-silent-speech-interface-models-by-adversarial-training.txt",
          "section_or_location": "6 Conclusions",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_3d-convolutional-neural-networks-for-ultrasound-based-silent-speech-interfaces",
      "slug": "3d-convolutional-neural-networks-for-ultrasound-based-silent-speech-interfaces",
      "title": "3D Convolutional Neural Networks for Ultrasound-Based Silent Speech Interfaces",
      "year": 2021,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "László Tóth",
        "Amin Honarmandi Shandiz"
      ],
      "url": "https://nao-ki-mura.com/paper/3d-convolutional-neural-networks-for-ultrasound-based-silent-speech-interfaces",
      "doi": "10.1007/978-3-030-61401-0_16",
      "doi_url": "https://doi.org/10.1007/978-3-030-61401-0_16",
      "arxiv_id": "2104.11532",
      "arxiv_url": "https://arxiv.org/abs/2104.11532",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium-high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:tongue",
        "modality:ultrasound",
        "output:speech-audio",
        "task:speech-reconstruction",
        "evaluation:quantitative",
        "deployment:speaker-dependent"
      ],
      "expert_take_short": "Temporal context helps, but the evidence is a single-speaker vocoder-parameter study.",
      "expert_take_long": "The paper provides a solid but narrow contribution showing that, on a single-speaker ultrasound dataset, a (2+1)D 3D CNN architecture that processes spaced frames (stride s=6, about 300 ms context) outperforms both 2D CNN and a more complex CNN+LSTM model in regression of vocoder coefficients (13 LSP parameters without F0). The 3D CNN yields test MSE of 0.315 and mean R2 of 0.683 compared to 0.366/0.633 for 2D CNN and 0.336/0.661 for CNN+LSTM. The study is limited to speaker-dependent regression, no pitch modeling, no listening tests, and one female Hungarian speaker. The method shows promise as a simpler temporal front-end for ultrasound SSI but requires more robust evaluation and evaluation of perceptual quality before deployment.",
      "expert_true_value": "Demonstrates that a compact (2+1)D 3D CNN outperforms 2D CNN and CNN+LSTM models for ultrasound video to speech vocoder parameter regression on a standard dataset, improving regression metrics while reducing model complexity and training time.",
      "canon_before": "Ultrasound SSI systems often processed frames independently or used heavier recurrent stacks to add temporal context.",
      "delta_from_canon": "Uses spaced temporal context inside a compact 3D CNN instead of a recurrent sequence model.",
      "position_in_field": "Method paper for speaker-dependent ultrasound-to-acoustic mapping.",
      "practical_value": "Useful as a simpler temporal front-end for ultrasound SSI models when the task is frame-aligned regression of vocoder acoustic features.",
      "axes_moved": "algorithm design; model efficiency",
      "axes_unresolved": "cross-speaker robustness; perceptual quality; silent articulation",
      "axes_regressed": "",
      "technical_limits": "Single-speaker data; models 13 vocoder coefficients without pitch; no silent articulation or real-time deployment tested; no subjective quality evaluation.",
      "evaluation_limits": "Only MSE and R2 are reported; there are no listening tests or intelligibility evaluations.",
      "deployment_limits": "No silent articulation scenario, no real-time study, and no robustness-to-probe-shift analysis.",
      "scope_limits": "Speaker-dependent direct speech synthesis from read-aloud ultrasound, not general SSI deployment.",
      "task": "speech-reconstruction",
      "input_modality": "ultrasound",
      "sensor_hardware": "Micro ultrasound system by Articulate Instruments Ltd. with 2-4 MHz 64-element convex transducer at 82 fps",
      "body_site": "tongue",
      "output_type": "speech-audio",
      "vocabulary_type": "",
      "vocabulary_size": "",
      "metrics": "Mean squared error (MSE) and mean R2 coefficient for 13 vocoder parameters predicting Mel-Generalized Cepstral Coefficients (LSP representation) without pitch (F0).",
      "evaluation_mode": "Development/test objective comparison across FCN, 2D CNN, 3D CNN with various temporal strides, and re-trained CNN+LSTM under matched parameter count.",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 1,
          "statement": "The paper claims that a compact (2+1)D 3D CNN is a better ultrasound-to-acoustic regressor than 2D CNN and CNN+LSTM baselines.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_3d-convolutional-neural-networks-for-ultrasound-based-silent-speech-interfaces-3d-convolutional-neural-networks-for-ultrasound-based-silent-speech-interfaces.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_3d-convolutional-neural-networks-for-ultrasound-based-silent-speech-interfaces-3d-convolutional-neural-networks-for-ultrasound-based-silent-speech-interfaces.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "fact",
          "confidence": 1,
          "statement": "Ultrasound tongue videos from one Hungarian female speaker reading 438 sentences were collected, split as 310 train, 41 dev, 87 test.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_3d-convolutional-neural-networks-for-ultrasound-based-silent-speech-interfaces-3d-convolutional-neural-networks-for-ultrasound-based-silent-speech-interfaces.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_3d-convolutional-neural-networks-for-ultrasound-based-silent-speech-interfaces-3d-convolutional-neural-networks-for-ultrasound-based-silent-speech-interfaces.txt",
          "section_or_location": "3 Data Acquisition",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "fact",
          "confidence": 1,
          "statement": "The model predicts 13 vocoder coefficients representing Mel-Generalized Cepstral Coefficients converted to Line Spectral Pairs (LSPs) standardized per coefficient; pitch (F0) is not estimated.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_3d-convolutional-neural-networks-for-ultrasound-based-silent-speech-interfaces-3d-convolutional-neural-networks-for-ultrasound-based-silent-speech-interfaces.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_3d-convolutional-neural-networks-for-ultrasound-based-silent-speech-interfaces-3d-convolutional-neural-networks-for-ultrasound-based-silent-speech-interfaces.txt",
          "section_or_location": "3 Data Acquisition",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "fact",
          "confidence": 1,
          "statement": "The 3D CNN processes sequences of 5 ultrasound frames spaced s frames apart; best stride s=6 corresponds to ~300 ms context.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_3d-convolutional-neural-networks-for-ultrasound-based-silent-speech-interfaces-3d-convolutional-neural-networks-for-ultrasound-based-silent-speech-interfaces.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_3d-convolutional-neural-networks-for-ultrasound-based-silent-speech-interfaces-3d-convolutional-neural-networks-for-ultrasound-based-silent-speech-interfaces.txt",
          "section_or_location": "4 Experimental Set-Up",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "3D CNN with s=6 achieves MSE 0.315 and mean R2 0.683 on test set; 2D CNN MSE 0.366 R2 0.633; CNN+LSTM MSE 0.336 R2 0.661.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_3d-convolutional-neural-networks-for-ultrasound-based-silent-speech-interfaces-3d-convolutional-neural-networks-for-ultrasound-based-silent-speech-interfaces.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_3d-convolutional-neural-networks-for-ultrasound-based-silent-speech-interfaces-3d-convolutional-neural-networks-for-ultrasound-based-silent-speech-interfaces.txt",
          "section_or_location": "5 Results",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 1,
          "statement": "The study is limited to one speaker, 13 vocoder coefficients without pitch, no listening tests, no silent articulation scenario or real-time testing.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_3d-convolutional-neural-networks-for-ultrasound-based-silent-speech-interfaces-3d-convolutional-neural-networks-for-ultrasound-based-silent-speech-interfaces.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_3d-convolutional-neural-networks-for-ultrasound-based-silent-speech-interfaces-3d-convolutional-neural-networks-for-ultrasound-based-silent-speech-interfaces.txt",
          "section_or_location": "6 Conclusions",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "deployment_claim",
          "confidence": 1,
          "statement": "3D CNN model is smaller and trains faster than CNN+LSTM with five times more parameters; 3D CNN is a viable alternative for ultrasound SSI video regression but not ready for real-world deployment.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_3d-convolutional-neural-networks-for-ultrasound-based-silent-speech-interfaces-3d-convolutional-neural-networks-for-ultrasound-based-silent-speech-interfaces.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_3d-convolutional-neural-networks-for-ultrasound-based-silent-speech-interfaces-3d-convolutional-neural-networks-for-ultrasound-based-silent-speech-interfaces.txt",
          "section_or_location": "6 Conclusions",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_htmd-net-a-hybrid-masking-denoising-approach-to-time-domain-monaural-singing-voice-separation",
      "slug": "htmd-net-a-hybrid-masking-denoising-approach-to-time-domain-monaural-singing-voice-separation",
      "title": "HTMD-Net: A Hybrid Masking-Denoising Approach to Time-Domain Monaural Singing Voice Separation",
      "year": 2021,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Christos Garoufis",
        "Athanasia Zlatintsi",
        "Petros Maragos"
      ],
      "url": "https://nao-ki-mura.com/paper/htmd-net-a-hybrid-masking-denoising-approach-to-time-domain-monaural-singing-voice-separation",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2103.04336",
      "arxiv_url": "https://arxiv.org/abs/2103.04336",
      "review_state": "expert_fulltext_draft",
      "review_priority": "low",
      "review_confidence": "medium-high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "modality:acoustic",
        "output:audio",
        "evaluation:quantitative",
        "evaluation:structured-benchmark"
      ],
      "expert_take_short": "Solid time-domain music vocal separation paper with a novel hybrid masking-denoising design showing improved silent-segment suppression; not relevant to SSI applications.",
      "expert_take_long": "HTMD-Net introduces a hybrid masking-denoising architecture for time-domain monaural singing voice separation that leverages a masking network to obtain an initial source estimate and a denoising network with skip connections to refine it. Trained and evaluated on the MUSDB18 dataset under various loss functions and deep supervision settings, HTMD-Net achieves competitive separation metrics compared to Conv-TasNet and Wave-U-Net, with especially improved behavior during silent vocal segments, as measured by predicted energy at silence and vocal activity detection. It has a smaller parameter footprint and achieves faster inference times than the Conv-TasNet baseline. Despite these merits, the work strictly pertains to music source separation and lacks direct relevance or application to silent speech interfaces (SSI). Thus, while it is a solid contribution within audio source separation, it should be considered out of scope for the SSI field.",
      "expert_true_value": "This paper advances monaural singing voice separation by integrating masking and denoising networks with deep supervision to improve silent-segment handling, setting competitive benchmark results on MUSDB18 but offers no direct SSI contribution.",
      "canon_before": "Time-domain source-separation models often choose between masking or denoising formulations and can behave poorly on silent vocal segments.",
      "delta_from_canon": "Combines masking and denoising modules serially with deep supervision, explicitly addressing silent-segment behavior.",
      "position_in_field": "Out-of-scope audio source-separation paper included as a distractor in SSI archive.",
      "practical_value": "Relevant to music vocal source separation research; lacks SSI application or interface value.",
      "axes_moved": "audio-source-separation evaluation",
      "axes_unresolved": "subjective quality; SSI relevance",
      "axes_regressed": "",
      "technical_limits": "Architecture and experiments limited to music singing voice separation; no adaptation or justification for SSI or other speech separation tasks.",
      "evaluation_limits": "Evaluation limited to MUSDB18 dataset, objective metrics only, no subjective listening tests reported.",
      "deployment_limits": "No silent-speech use case or deployment scenario discussed.",
      "scope_limits": "Monaural music singing voice separation only; no speech or SSI domain.",
      "task": "audio source separation",
      "input_modality": "acoustic",
      "sensor_hardware": "",
      "body_site": "",
      "output_type": "audio",
      "vocabulary_type": "",
      "vocabulary_size": "",
      "metrics": "Median and mean SDR, SIR, SAR at song-wise and segment-wise levels, plus predicted energy at silence (PES) and vocal activity detection accuracy (VAD)",
      "evaluation_mode": "Quantitative evaluation on MUSDB18 with statistical significance testing and silent-segment metrics (PES, VAD).",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 1,
          "statement": "Proposes a hybrid time-domain approach, termed the HTMD-Net, combining a lightweight masking component and a denoising module based on skip connections to refine the source estimated by masking, achieving competitive monaural singing voice separation with better silent-segment suppression and computational efficiency.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_htmd-net-a-hybrid-masking-denoising-approach-to-time-domain-monaural-singing-voice-separation-htmd-net-a-hybrid-masking-denoising-approach-to-time-domain-monaural-singing-voi.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_htmd-net-a-hybrid-masking-denoising-approach-to-time-domain-monaural-singing-voice-separation-htmd-net-a-hybrid-masking-denoising-approach-to-time-domain-monaural-singing-voi.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 0.95,
          "statement": "HTMD-Net architecture consists of two serially connected modules: an initial mask estimator applying a mask on a latent mixture representation, followed by a denoising network with skip connections refining the estimate; deep supervision is applied over the intermediate and final outputs.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_htmd-net-a-hybrid-masking-denoising-approach-to-time-domain-monaural-singing-voice-separation-htmd-net-a-hybrid-masking-denoising-approach-to-time-domain-monaural-singing-voi.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_htmd-net-a-hybrid-masking-denoising-approach-to-time-domain-monaural-singing-voice-separation-htmd-net-a-hybrid-masking-denoising-approach-to-time-domain-monaural-singing-voi.txt",
          "section_or_location": "II. M ETHODOLOGY",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 1,
          "statement": "Evaluation conducted on the MUSDB18 monaural singing voice separation benchmark, with 100 training songs and 50 test songs downsampled to 22.05 kHz mono; metrics include SDR, SIR, SAR at song and segment levels, plus predicted energy at silence (PES) and vocal activity detection accuracy (VAD).",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_htmd-net-a-hybrid-masking-denoising-approach-to-time-domain-monaural-singing-voice-separation-htmd-net-a-hybrid-masking-denoising-approach-to-time-domain-monaural-singing-voi.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_htmd-net-a-hybrid-masking-denoising-approach-to-time-domain-monaural-singing-voice-separation-htmd-net-a-hybrid-masking-denoising-approach-to-time-domain-monaural-singing-voi.txt",
          "section_or_location": "III. E XPERIMENTAL S ETUP",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "Performance metrics include song-wise median SDR (5.16 dB with MSE loss), SIR (10.24 dB), SAR (8.53 dB), and segment-wise predicted energy at silence (PES) and vocal activity detection accuracy (VAD) (e.g. VAD 84.7% for HTMD-Net with MSE/MSE training).",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_htmd-net-a-hybrid-masking-denoising-approach-to-time-domain-monaural-singing-voice-separation-htmd-net-a-hybrid-masking-denoising-approach-to-time-domain-monaural-singing-voi.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_htmd-net-a-hybrid-masking-denoising-approach-to-time-domain-monaural-singing-voice-separation-htmd-net-a-hybrid-masking-denoising-approach-to-time-domain-monaural-singing-voi.txt",
          "section_or_location": "IV. R ESULTS",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 0.9,
          "statement": "Statistical significance tests (Wilcoxon signed-rank and McNemar’s tests) establish HTMD-Net performs comparably to Conv-TasNet on SDR but better in silent segment handling (lower PES, higher VAD). HTMD-Net outperforms Wave-U-Net significantly on all metrics.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_htmd-net-a-hybrid-masking-denoising-approach-to-time-domain-monaural-singing-voice-separation-htmd-net-a-hybrid-masking-denoising-approach-to-time-domain-monaural-singing-voi.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_htmd-net-a-hybrid-masking-denoising-approach-to-time-domain-monaural-singing-voice-separation-htmd-net-a-hybrid-masking-denoising-approach-to-time-domain-monaural-singing-voi.txt",
          "section_or_location": "IV. R ESULTS",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "deployment_claim",
          "confidence": 0.8,
          "statement": "The paper focuses solely on monaural singing voice separation for music and does not discuss deployment or use cases within silent speech interfaces or related fields.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_htmd-net-a-hybrid-masking-denoising-approach-to-time-domain-monaural-singing-voice-separation-htmd-net-a-hybrid-masking-denoising-approach-to-time-domain-monaural-singing-voi.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_htmd-net-a-hybrid-masking-denoising-approach-to-time-domain-monaural-singing-voice-separation-htmd-net-a-hybrid-masking-denoising-approach-to-time-domain-monaural-singing-voi.txt",
          "section_or_location": "V. C ONCLUSIONS",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 1,
          "statement": "The architecture is designed and tuned specifically for music source separation and lacks exploration or applicability to SSI; evaluation is limited to MUSDB18 with no subjective listening tests reported.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_htmd-net-a-hybrid-masking-denoising-approach-to-time-domain-monaural-singing-voice-separation-htmd-net-a-hybrid-masking-denoising-approach-to-time-domain-monaural-singing-voi.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_htmd-net-a-hybrid-masking-denoising-approach-to-time-domain-monaural-singing-voice-separation-htmd-net-a-hybrid-masking-denoising-approach-to-time-domain-monaural-singing-voi.txt",
          "section_or_location": "V. C ONCLUSIONS",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_silent-versus-modal-multi-speaker-speech-recognition-from-ultrasound-and-video",
      "slug": "silent-versus-modal-multi-speaker-speech-recognition-from-ultrasound-and-video",
      "title": "Silent versus modal multi-speaker speech recognition from ultrasound and video",
      "year": 2021,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Manuel Sam Ribeiro",
        "Aciel Eshky",
        "Korin Richmond",
        "Steve Renals"
      ],
      "url": "https://nao-ki-mura.com/paper/silent-versus-modal-multi-speaker-speech-recognition-from-ultrasound-and-video",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2103.00333",
      "arxiv_url": "https://arxiv.org/abs/2103.00333",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:lip",
        "body_site:tongue",
        "modality:multimodal",
        "output:text",
        "task:speech-recognition",
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "deployment:hands-free",
        "deployment:speaker-independent"
      ],
      "expert_take_short": "Large-corpus baseline with real silent-mode gap.",
      "expert_take_long": "The paper is valuable because it does not hide the hard part. Even on the larger TaL80 setup, silent WER remains much worse than modal WER, and adaptation only partly closes the gap. That is a credible result, not a failure: it shows exactly where multi-speaker ultrasound SSI breaks. The secondary analysis matters too, because it confirms that silent speech is slower and occupies a smaller articulatory space, which helps explain why modal-trained recognizers transfer poorly.",
      "expert_true_value": "The strongest contribution is not headline WER but the diagnosis that silent speech is slower, occupies a smaller articulatory space, and benefits from adaptation but remains far harder than modal speech.",
      "canon_before": "Ultrasound-plus-video SSI work was often small-scale, single-speaker, and weak on speaking-mode mismatch.",
      "delta_from_canon": "Moves to an 82-speaker corpus and treats silent-versus-modal mismatch as a domain adaptation problem with articulatory analysis alongside recognition.",
      "position_in_field": "Core multi-speaker ultrasound SSI recognition paper.",
      "practical_value": "Useful as a realistic benchmark for silent-versus-modal mismatch and for judging what classical adaptation buys on larger ultrasound corpora.",
      "axes_moved": "benchmark scale; evaluation; adaptation analysis",
      "axes_unresolved": "real-time interaction; robustness to probe shift; in-the-wild generalization",
      "axes_regressed": "",
      "technical_limits": "Performance is still far from usable for many silent conditions, and the pipeline relies on careful ultrasound capture plus external adaptation tricks.",
      "evaluation_limits": "No live interface evaluation or end-user task is reported.",
      "deployment_limits": "Portable deployment remains unclear because the system still depends on probe placement and corpus-style recording conditions.",
      "scope_limits": "Recognition from ultrasound and video under controlled corpus recording.",
      "task": "speech recognition from ultrasound and lip video",
      "input_modality": "ultrasound tongue imaging and lip video",
      "sensor_hardware": "ultrasound probe and camera",
      "body_site": "lip; tongue",
      "output_type": "text",
      "vocabulary_type": "open-vocabulary ASR",
      "vocabulary_size": "",
      "metrics": "On TaL80 multi-speaker, silent WER drops from 77.79 raw to 69.84 with fMLLR plus unsupervised adaptation, while modal WER is 39.34 raw; on TaL1 speaker-dependent, silent WER falls from 52.64 raw to 37.94",
      "evaluation_mode": "TaL80 multi-speaker and TaL1 speaker-dependent WER across modal, silent, and whispered speech with articulatory-space analysis",
      "evidence": [
        {
          "claim_type": "validation_scope",
          "confidence": 0.99,
          "statement": "The TaL corpus contains six sessions from one professional speaker in TaL1 and single-session recordings from 81 additional native English speakers in TaL80.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_silent-versus-modal-multi-speaker-speech-recognition-from-ultrasound-and-video-silent-versus-modal-multi-speaker-speech-recognition-from-ultrasound-and-video.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_silent-versus-modal-multi-speaker-speech-recognition-from-ultrasound-and-video-silent-versus-modal-multi-speaker-speech-recognition-from-ultrasound-and-video.txt",
          "section_or_location": "2. The TaL corpus",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.99,
          "statement": "On TaL80 multi-speaker evaluation, silent WER improves from 77.79 raw to 69.84 with fMLLR plus unsupervised adaptation, while modal raw WER is 39.34.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_silent-versus-modal-multi-speaker-speech-recognition-from-ultrasound-and-video-silent-versus-modal-multi-speaker-speech-recognition-from-ultrasound-and-video.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_silent-versus-modal-multi-speaker-speech-recognition-from-ultrasound-and-video-silent-versus-modal-multi-speaker-speech-recognition-from-ultrasound-and-video.txt",
          "section_or_location": "Table 1: Word error rate on modal, silent, and whispered speech",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "fact",
          "confidence": 0.98,
          "statement": "The analysis finds significant speaking-mode differences in syllable rate and convex-hull articulatory space, but those differences do not directly correlate with WER differences.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_silent-versus-modal-multi-speaker-speech-recognition-from-ultrasound-and-video-silent-versus-modal-multi-speaker-speech-recognition-from-ultrasound-and-video.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_silent-versus-modal-multi-speaker-speech-recognition-from-ultrasound-and-video-silent-versus-modal-multi-speaker-speech-recognition-from-ultrasound-and-video.txt",
          "section_or_location": "4. Analysis",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_ema2s-an-end-to-end-multimodal-articulatory-to-speech-system",
      "slug": "ema2s-an-end-to-end-multimodal-articulatory-to-speech-system",
      "title": "EMA2S: An End-to-End Multimodal Articulatory-to-Speech System",
      "year": 2021,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Yu-Wen Chen",
        "Kuo-Hsuan Hung",
        "Shang-Yi Chuang",
        "Jonathan Sherman",
        "Wen-Chin Huang",
        "Xugang Lu",
        "Yu Tsao"
      ],
      "url": "https://nao-ki-mura.com/paper/ema2s-an-end-to-end-multimodal-articulatory-to-speech-system",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2102.03786",
      "arxiv_url": "https://arxiv.org/abs/2102.03786",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium-high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:jaw",
        "body_site:lip",
        "body_site:tongue",
        "modality:magnetic",
        "output:speech-audio",
        "task:speech-reconstruction",
        "evaluation:quantitative"
      ],
      "expert_take_short": "EMA2S achieves consistent quality improvements over prior EMA-to-speech baselines by combining multimodal joint loss training with a neural vocoder, though gains remain confined to lab EMA conditions.",
      "expert_take_long": "EMA2S integrates a spectral encoder, EMA encoder, and a shared decoder, training jointly to minimize losses on spectrogram, mel-spectrogram, and a deep feature loss measuring embedding similarity between articulatory and acoustic modalities. The use of a neural vocoder (Parallel WaveGAN) marks a departure from traditional parametric vocoders and yields demonstrable benefit. Evaluated on the NTT EMA corpus with three speakers, EMA2S outperforms a strong BLSTM-based baseline on MCD (7.815 to 7.176), PESQ (1.279 to 1.350), STOI (0.696 to 0.716), and CCR (0.818 to 0.868). In an A/B listening test with 10 participants, EMA2S was preferred 83% of the time. A reduced four-EMA-sensor variant also improves over baseline, indicating potential for reduced sensor setups. Despite these gains, the hardware remains intrusive laboratory EMA, limiting deployment practicalities, and the dataset size and speaker count constrain generalizability. The study contributes a rigorous multimodal joint-loss EMA-to-waveform synthesis pipeline with verified gains over prior methods but does not close the gap to wearable silent speech interfaces.",
      "expert_true_value": "Provides a more natural and intelligible articulatory-to-speech synthesis baseline by effectively leveraging a neural vocoder and multimodal joint loss, improving reconstruction quality beyond prior parametric vocoder pipelines.",
      "canon_before": "EMA-to-speech systems typically relied on parametric vocoders with a single acoustic loss, limiting naturalness and intelligibility.",
      "delta_from_canon": "Replaces traditional parametric vocoders with an end-to-end neural vocoder architecture (Parallel WaveGAN) and incorporates a combined loss over spectrogram, mel-spectrogram, and deep features for improved articulatory-to-speech mapping.",
      "position_in_field": "EMA-based articulatory-to-speech synthesis with the integration of modern neural vocoding and multimodal loss training, showing incremental but well-documented gains.",
      "practical_value": "Serves as a significant improved lab baseline for EMA articulatory-to-speech synthesis; useful for research on data-efficient, multimodal articulatory representations but limited for immediate practical use due to hardware requirements.",
      "axes_moved": "System design; reconstruction quality improved via neural vocoder and joint multimodal loss training.",
      "axes_unresolved": "Hardware practicality; cross-speaker robustness; naturalness ceiling remain unresolved.",
      "axes_regressed": "",
      "technical_limits": "Small dataset size (three speakers, 354 utterances each), dependence on intrusive EMA sensors, and lack of large-scale or diverse speaker evaluation.",
      "evaluation_limits": "Evaluations are confined to a small dataset with only three speakers; the subjective A/B listening test involved only 10 participants with 15 questions total, and no cross-speaker generalization testing was reported.",
      "deployment_limits": "The system requires laboratory-grade EMA hardware, which is intrusive and limits portability. Even the reduced four-sensor configuration still depends on EMA instrumentation and controlled environment recording conditions.",
      "scope_limits": "Focused on laboratory EMA articulatory-to-speech synthesis with neural vocoder reconstruction; not addressing wearable or silent speech device deployment.",
      "task": "speech-reconstruction",
      "input_modality": "magnetic (EMA)",
      "sensor_hardware": "Electromagnetic midsagittal articulography (EMA) with nine sensors including lips, jaw, and multiple tongue positions.",
      "body_site": "jaw; lip; tongue",
      "output_type": "speech-audio",
      "vocabulary_type": "",
      "vocabulary_size": "",
      "metrics": "Mel-cepstral distortion, PESQ, short-time objective intelligibility, character correct rate by ASR, and subjective A/B listening preference percentages.",
      "evaluation_mode": "Objective evaluation metrics (mel-cepstral distortion, PESQ, STOI, character correct rate with pre-trained ASR) combined with subjective A/B listening preference test and ablation with fewer sensors.",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 1,
          "statement": "We present EMA2S, an end-to-end multimodal articulatory-to-speech system that directly converts articulatory movements to speech signals using a neural-network-based vocoder combined with multimodal joint-training of spectrogram, mel-spectrogram, and deep features.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_ema2s-an-end-to-end-multimodal-articulatory-to-speech-system-ema2s-an-end-to-end-multimodal-articulatory-to-speech-system.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_ema2s-an-end-to-end-multimodal-articulatory-to-speech-system-ema2s-an-end-to-end-multimodal-articulatory-to-speech-system.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 1,
          "statement": "EMA2S uses a combined multimodal loss consisting of spectrogram loss, mel-spectrogram loss, and deep feature loss, leveraging the idea of multimodal learning by calculating dissimilarity between articulatory and spectral embeddings during training.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_ema2s-an-end-to-end-multimodal-articulatory-to-speech-system-ema2s-an-end-to-end-multimodal-articulatory-to-speech-system.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_ema2s-an-end-to-end-multimodal-articulatory-to-speech-system-ema2s-an-end-to-end-multimodal-articulatory-to-speech-system.txt",
          "section_or_location": "III. PROPOSED METHOD",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 1,
          "statement": "The system replaces traditional parametric vocoders with a Parallel WaveGAN-based neural vocoder, allowing an end-to-end trainable articulatory-to-speech waveform synthesis pipeline.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_ema2s-an-end-to-end-multimodal-articulatory-to-speech-system-ema2s-an-end-to-end-multimodal-articulatory-to-speech-system.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_ema2s-an-end-to-end-multimodal-articulatory-to-speech-system-ema2s-an-end-to-end-multimodal-articulatory-to-speech-system.txt",
          "section_or_location": "III. PROPOSED METHOD",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "Objective evaluation metrics used include mel-cepstral distortion (MCD), perceptual evaluation of speech quality (PESQ), short-time objective intelligibility (STOI), and character correct rate (CCR) from a pre-trained automatic speech recognition (ASR) system.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_ema2s-an-end-to-end-multimodal-articulatory-to-speech-system-ema2s-an-end-to-end-multimodal-articulatory-to-speech-system.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_ema2s-an-end-to-end-multimodal-articulatory-to-speech-system-ema2s-an-end-to-end-multimodal-articulatory-to-speech-system.txt",
          "section_or_location": "IV. EXPERIMENTS",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 1,
          "statement": "Experiments performed on the NTT EMA corpus with three speakers, each contributing 354 utterances, split into 304 training and 50 testing utterances per speaker, with sensors placed on upper lip, lower lip, upper jaw, lower jaw, tongue tip, tongue blade, tongue dorsum, tongue rear, and velum.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_ema2s-an-end-to-end-multimodal-articulatory-to-speech-system-ema2s-an-end-to-end-multimodal-articulatory-to-speech-system.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_ema2s-an-end-to-end-multimodal-articulatory-to-speech-system-ema2s-an-end-to-end-multimodal-articulatory-to-speech-system.txt",
          "section_or_location": "IV. EXPERIMENTS",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "The EMA2S system achieves superior objective scores compared to the baseline: MCD 7.176 vs 7.815, PESQ 1.350 vs 1.279, STOI 0.716 vs 0.696, and CCR 0.868 vs 0.818, as shown in Table I of the paper.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_ema2s-an-end-to-end-multimodal-articulatory-to-speech-system-ema2s-an-end-to-end-multimodal-articulatory-to-speech-system.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_ema2s-an-end-to-end-multimodal-articulatory-to-speech-system-ema2s-an-end-to-end-multimodal-articulatory-to-speech-system.txt",
          "section_or_location": "IV. EXPERIMENTS",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 1,
          "statement": "The subjective A/B listening test with 10 participants and 15 questions in total shows that EMA2S speech output was preferred 83% over the baseline's 17%.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_ema2s-an-end-to-end-multimodal-articulatory-to-speech-system-ema2s-an-end-to-end-multimodal-articulatory-to-speech-system.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_ema2s-an-end-to-end-multimodal-articulatory-to-speech-system-ema2s-an-end-to-end-multimodal-articulatory-to-speech-system.txt",
          "section_or_location": "IV. EXPERIMENTS",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 1,
          "statement": "The dataset is limited to three speakers and 354 utterances each, recorded in laboratory conditions with EMA sensors, which are intrusive and impractical for general deployment.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_ema2s-an-end-to-end-multimodal-articulatory-to-speech-system-ema2s-an-end-to-end-multimodal-articulatory-to-speech-system.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_ema2s-an-end-to-end-multimodal-articulatory-to-speech-system-ema2s-an-end-to-end-multimodal-articulatory-to-speech-system.txt",
          "section_or_location": "IV. EXPERIMENTS",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "deployment_claim",
          "confidence": 1,
          "statement": "Even with only four less invasive EMA sensors, the system still outperforms the baseline, but still depends on EMA hardware and controlled conditions, limiting portability and real-world deployment.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_ema2s-an-end-to-end-multimodal-articulatory-to-speech-system-ema2s-an-end-to-end-multimodal-articulatory-to-speech-system.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_ema2s-an-end-to-end-multimodal-articulatory-to-speech-system-ema2s-an-end-to-end-multimodal-articulatory-to-speech-system.txt",
          "section_or_location": "IV. EXPERIMENTS",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_convolutional-neural-network-based-age-estimation-using-b-mode-ultrasound-tongue-image",
      "slug": "convolutional-neural-network-based-age-estimation-using-b-mode-ultrasound-tongue-image",
      "title": "Convolutional Neural Network-Based Age Estimation Using B-Mode Ultrasound Tongue Image",
      "year": 2021,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Kele Xu",
        "Tamas Gabor Csapo",
        "Ming Feng"
      ],
      "url": "https://nao-ki-mura.com/paper/convolutional-neural-network-based-age-estimation-using-b-mode-ultrasound-tongue-image",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2101.11245",
      "arxiv_url": "https://arxiv.org/abs/2101.11245",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:tongue",
        "modality:ultrasound",
        "output:labels",
        "evaluation:quantitative"
      ],
      "expert_take_short": "Real signal, wrong target for SSI.",
      "expert_take_long": "The paper is honest proof-of-concept work. It shows the CNN beats mean-age baselines on both child cohorts, reaching validation MSE 2.03 on UXTD and 4.87 on UPX, so the images do carry age-related information. But the contribution is not to silent speech or speech reconstruction; it is a side-channel articulatory analysis result whose main value is methodological and clinical rather than interface-oriented.",
      "expert_true_value": "The full text supports only a narrow claim: age-related signal is present in child tongue ultrasound images, but the paper is an exploratory regression study rather than an SSI system contribution.",
      "canon_before": "Ultrasound tongue imaging in SSI work is usually used for articulatory analysis, speech therapy, or articulatory-to-acoustic mapping rather than demographic regression.",
      "delta_from_canon": "Recasts ultrasound tongue imaging as a speaker-age regression problem and treats age as a hidden signal in articulatory images.",
      "position_in_field": "Adjacent ultrasound analysis paper outside the core SSI pipeline literature.",
      "practical_value": "Useful if a speech-therapy workflow wants age-sensitive ultrasound analysis, but not useful as a communication interface result.",
      "axes_moved": "analysis; evaluation",
      "axes_unresolved": "generalization; adult speakers; cross-site robustness",
      "axes_regressed": "",
      "technical_limits": "Small child-only datasets, low-SNR ultrasound, and no evidence beyond two UltraSuite cohorts limit the result.",
      "evaluation_limits": "The reported success is validation-set regression error only; there is no external dataset or deployment test.",
      "deployment_limits": "No interface, runtime, or user-facing system is built.",
      "scope_limits": "Exploratory age estimation from tongue ultrasound rather than silent speech.",
      "task": "age estimation from ultrasound tongue images",
      "input_modality": "ultrasound tongue image",
      "sensor_hardware": "",
      "body_site": "tongue",
      "output_type": "age labels",
      "vocabulary_type": "",
      "vocabulary_size": "",
      "metrics": "Validation MSE 2.03 on UXTD with random rotation; validation MSE 4.87 on UPX; mean-age baselines 3.64 and 5.35",
      "evaluation_mode": "validation MSE comparison on UXTD and UPX child cohorts against mean-age baselines",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 0.99,
          "statement": "The paper explores whether speaker age can be inferred from ultrasound tongue imaging using a deep convolutional neural network.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_convolutional-neural-network-based-age-estimation-using-b-mode-ultrasound-tongue-image-convolutional-neural-network-based-age-estimation-using-b-mode-ultrasound-tongue.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_convolutional-neural-network-based-age-estimation-using-b-mode-ultrasound-tongue-image-convolutional-neural-network-based-age-estimation-using-b-mode-ultrasound-tongue.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.99,
          "statement": "For typically developing children, the minimum validation MSE is about 2.03, compared with a mean-age baseline of 3.64.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_convolutional-neural-network-based-age-estimation-using-b-mode-ultrasound-tongue-image-convolutional-neural-network-based-age-estimation-using-b-mode-ultrasound-tongue.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_convolutional-neural-network-based-age-estimation-using-b-mode-ultrasound-tongue-image-convolutional-neural-network-based-age-estimation-using-b-mode-ultrasound-tongue.txt",
          "section_or_location": "4.4. Experiments Results for Typically Developing Children",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 0.98,
          "statement": "The authors state the ultrasound dataset is much smaller than speech-based age datasets and that performance needs validation on larger data.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_convolutional-neural-network-based-age-estimation-using-b-mode-ultrasound-tongue-image-convolutional-neural-network-based-age-estimation-using-b-mode-ultrasound-tongue.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_convolutional-neural-network-based-age-estimation-using-b-mode-ultrasound-tongue-image-convolutional-neural-network-based-age-estimation-using-b-mode-ultrasound-tongue.txt",
          "section_or_location": "5. Conclusions",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_end-to-end-silent-speech-recognition-with-acoustic-sensing",
      "slug": "end-to-end-silent-speech-recognition-with-acoustic-sensing",
      "title": "End-to-end Silent Speech Recognition with Acoustic Sensing",
      "year": 2020,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "[''",
        "''",
        "''",
        "''",
        "'']"
      ],
      "url": "https://nao-ki-mura.com/paper/end-to-end-silent-speech-recognition-with-acoustic-sensing",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2011.11315",
      "arxiv_url": "https://arxiv.org/abs/2011.11315",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:lip",
        "modality:acoustic",
        "modality:microphone",
        "task:speech-recognition",
        "output:text",
        "deployment:mobile-suitable",
        "deployment:speaker-independent",
        "evaluation:quantitative",
        "evaluation:unseen-words"
      ],
      "expert_take_short": "Strong mobile-friendly acoustic SSI paper.",
      "expert_take_long": "The full text supports a substantial SSI claim: with phase and double-delta features plus an attention decoder, the system can recognize silent sentences from active acoustic reflections at nontrivial accuracy across users and environments. The domain-independent and unseen-sentence numbers are strong enough to make the modality credible, especially given the non-invasive smartphone-style hardware. The remaining limitation is scope: the dataset is still small, vocabulary coverage is only 54 sentences, and the paper does not prove real-world always-on deployment.",
      "expert_true_value": "Real SSI contribution: it shows commodity acoustic sensing can support non-invasive silent speech recognition beyond same-user laboratory memorization.",
      "canon_before": "Acoustic sensing for silent speech mostly targeted simpler gesture-style lip sensing and often relied on hand-crafted pipelines rather than sentence-level end-to-end recognition.",
      "delta_from_canon": "Builds a sentence-level silent speech recognizer from reflected acoustic phase features and shows cross-domain and unseen-sentence performance on a smartphone-like setup.",
      "position_in_field": "Core acoustic-sensing SSI paper focused on mobile-compatible silent speech recognition.",
      "practical_value": "Important evidence that smartphone-class active acoustics can act as a silent-speech sensor without worn articulatory hardware.",
      "axes_moved": "modality; deployment; evaluation",
      "axes_unresolved": "Larger vocabulary; field robustness; continuous use on commodity devices",
      "axes_regressed": "",
      "technical_limits": "Vocabulary remains small, evaluation is on a collected research dataset, and practical latency or always-on robustness are not established.",
      "evaluation_limits": "Only 54 sentences are covered, and the unseen-sentence split still operates inside that limited corpus design.",
      "deployment_limits": "Promising for smart devices, but field robustness, power cost, and broader-vocabulary behavior remain unresolved.",
      "scope_limits": "Acoustic silent speech recognition from lip movements only.",
      "task": "speech-recognition",
      "input_modality": "acoustic",
      "sensor_hardware": "smartphone speaker and microphone",
      "body_site": "lip",
      "output_type": "text",
      "vocabulary_type": "54 fixed sentences with unseen-sentence split",
      "vocabulary_size": "54 sentences",
      "metrics": "WER is 2.6% in domain-dependent testing, 8.4% average in domain-independent testing, and 8.1% in unseen-sentence testing; the worst unseen-sentence WER shown in the Top-10 list is 18.2%",
      "evaluation_mode": "domain-dependent, domain-independent, unseen-sentence, and CTC comparison WER evaluation",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 0.98,
          "statement": "The paper proposes a non-invasive silent speech recognition method using inaudible acoustic signals generated by smart devices for lip-reading.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_end-to-end-silent-speech-recognition-with-acoustic-sensing-end-to-end-silent-speech-recognition-with-acoustic-sensing.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_end-to-end-silent-speech-recognition-with-acoustic-sensing-end-to-end-silent-speech-recognition-with-acoustic-sensing.txt",
          "section_or_location": "5. CONCLUSION",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.98,
          "statement": "The conclusion reports WERs of 2.6% for domain-dependent testing, 8.4% for domain-independent testing, and 8.1% for unseen-sentence testing.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_end-to-end-silent-speech-recognition-with-acoustic-sensing-end-to-end-silent-speech-recognition-with-acoustic-sensing.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_end-to-end-silent-speech-recognition-with-acoustic-sensing-end-to-end-silent-speech-recognition-with-acoustic-sensing.txt",
          "section_or_location": "5. CONCLUSION",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 0.96,
          "statement": "Section 4.3 states that the unseen-sentence evaluation covers 54 sentences with leave-one-sentence-out validation, and the worst Top-10 WER shown is 18.2%.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_end-to-end-silent-speech-recognition-with-acoustic-sensing-end-to-end-silent-speech-recognition-with-acoustic-sensing.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_end-to-end-silent-speech-recognition-with-acoustic-sensing-end-to-end-silent-speech-recognition-with-acoustic-sensing.txt",
          "section_or_location": "4.3. Evaluation and Performance",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_speech-prediction-in-silent-videos-using-variational-autoencoders",
      "slug": "speech-prediction-in-silent-videos-using-variational-autoencoders",
      "title": "Speech Prediction in Silent Videos using Variational Autoencoders",
      "year": 2020,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Ravindra Yadav",
        "Ashish Sardana",
        "Vinay P Namboodiri",
        "Rajesh M Hegde"
      ],
      "url": "https://nao-ki-mura.com/paper/speech-prediction-in-silent-videos-using-variational-autoencoders",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2011.07340",
      "arxiv_url": "https://arxiv.org/abs/2011.07340",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:face",
        "body_site:lip",
        "modality:video",
        "output:speech-audio",
        "task:speech-reconstruction",
        "evaluation:quantitative",
        "evaluation:structured-benchmark"
      ],
      "expert_take_short": "Strong video-to-speech paper that models ambiguity explicitly.",
      "expert_take_long": "The full text supports the central claim: the paper is trying to solve multimodality, not merely squeeze a few points from a deterministic baseline. Table 1 shows the result is nuanced rather than absolute domination, with better ESTOI and PESQ than prior spectrogram-based systems but not the top STOI overall. The real contribution is Section 4.3, where diverse outputs from the same silent clip justify the variational framing.",
      "expert_true_value": "A meaningful step for video-to-speech because it attacks one-to-many ambiguity directly, though the gains are concentrated in quality metrics rather than a clean sweep of all measures.",
      "canon_before": "Prior silent-video speech systems mostly assumed a deterministic mapping from lip movements to audio.",
      "delta_from_canon": "This paper uses a variational formulation to model uncertainty and generate multiple plausible audio realizations for the same video.",
      "position_in_field": "Core video-based speech reconstruction work within SSI-adjacent silent-video research.",
      "practical_value": "Useful when the target is plausible speech synthesis from silent video rather than text decoding.",
      "axes_moved": "multimodality_modeling; video_to_speech; diversity",
      "axes_unresolved": "speaker generalization, higher-fidelity vocoding, and deployment remain unresolved.",
      "axes_regressed": "",
      "technical_limits": "Waveform recovery still depends on Griffin-Lim and the benchmark remains constrained to GRID.",
      "evaluation_limits": "No open-world or speaker-independent deployment evaluation is provided in the extracted text.",
      "deployment_limits": "No real-time or in-the-wild deployment story is established.",
      "scope_limits": "Silent-video speech reconstruction only.",
      "task": "speech-reconstruction",
      "input_modality": "silent lip video",
      "sensor_hardware": "camera",
      "body_site": "face; lip",
      "output_type": "speech-audio",
      "vocabulary_type": "",
      "vocabulary_size": "",
      "metrics": "On GRID the proposed model reports STOI 0.724, ESTOI 0.540, and PESQ 1.932; it trails Lip2Wav on STOI by 0.007 but leads on ESTOI and PESQ.",
      "evaluation_mode": "GRID benchmark with STOI, ESTOI, PESQ, qualitative comparison, and diversity sampling",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 0.97,
          "statement": "The abstract says the paper presents a stochastic model for generating speech from silent video instead of assuming a deterministic one-to-one mapping.",
          "evidence_source": "/Users/naoki/projects/nao-ki-mura/inputs/ssi_fulltext/text/ssi_speech-prediction-in-silent-videos-using-variational-autoencoders-speech-prediction-in-silent-videos-using-variational-autoencoders.txt",
          "source_ref": "/Users/naoki/projects/nao-ki-mura/inputs/ssi_fulltext/text/ssi_speech-prediction-in-silent-videos-using-variational-autoencoders-speech-prediction-in-silent-videos-using-variational-autoencoders.txt",
          "section_or_location": "ABSTRACT",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.96,
          "statement": "Table 1 reports the proposed model at STOI 0.724, ESTOI 0.540, and PESQ 1.932 on GRID.",
          "evidence_source": "/Users/naoki/projects/nao-ki-mura/inputs/ssi_fulltext/text/ssi_speech-prediction-in-silent-videos-using-variational-autoencoders-speech-prediction-in-silent-videos-using-variational-autoencoders.txt",
          "source_ref": "/Users/naoki/projects/nao-ki-mura/inputs/ssi_fulltext/text/ssi_speech-prediction-in-silent-videos-using-variational-autoencoders-speech-prediction-in-silent-videos-using-variational-autoencoders.txt",
          "section_or_location": "4.1. Quantitative Evaluation",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 0.94,
          "statement": "Section 4.3 argues that varying the latent variable z produces multiple plausible speech waveforms for the same input video, supporting the multimodality claim.",
          "evidence_source": "/Users/naoki/projects/nao-ki-mura/inputs/ssi_fulltext/text/ssi_speech-prediction-in-silent-videos-using-variational-autoencoders-speech-prediction-in-silent-videos-using-variational-autoencoders.txt",
          "source_ref": "/Users/naoki/projects/nao-ki-mura/inputs/ssi_fulltext/text/ssi_speech-prediction-in-silent-videos-using-variational-autoencoders-speech-prediction-in-silent-videos-using-variational-autoencoders.txt",
          "section_or_location": "4.3. Diverse Predictions",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 0.91,
          "statement": "The model reconstructs time-domain audio via Griffin-Lim from mel spectrograms, which keeps the pipeline in the spectrogram-reconstruction regime rather than end-to-end waveform vocoding.",
          "evidence_source": "/Users/naoki/projects/nao-ki-mura/inputs/ssi_fulltext/text/ssi_speech-prediction-in-silent-videos-using-variational-autoencoders-speech-prediction-in-silent-videos-using-variational-autoencoders.txt",
          "source_ref": "/Users/naoki/projects/nao-ki-mura/inputs/ssi_fulltext/text/ssi_speech-prediction-in-silent-videos-using-variational-autoencoders-speech-prediction-in-silent-videos-using-variational-autoencoders.txt",
          "section_or_location": "4.1. Quantitative Evaluation",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_x-tasnet-robust-and-accurate-time-domain-speaker-extraction-network",
      "slug": "x-tasnet-robust-and-accurate-time-domain-speaker-extraction-network",
      "title": "X-TaSNet: Robust and Accurate Time-Domain Speaker Extraction Network",
      "year": 2020,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Zining Zhang",
        "Bingsheng He",
        "Zhenjie Zhang"
      ],
      "url": "https://nao-ki-mura.com/paper/x-tasnet-robust-and-accurate-time-domain-speaker-extraction-network",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2010.12766",
      "arxiv_url": "https://arxiv.org/abs/2010.12766",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "modality:acoustic",
        "output:speech-audio",
        "task:speech-enhancement"
      ],
      "expert_take_short": "Strong time-domain target-speaker extraction using speaker verification and innovative training; improves robustness to absent target but remains speech extraction, not silent speech.",
      "expert_take_long": "X-TaSNet advances the state of speaker extraction by integrating a pretrained speaker verification module with a time-domain Conv-TaSNet architecture, enhancing robustness to absent target speakers via a distortion-based auxiliary loss and an alternating training scheme. The authors demonstrate approximately doubling SDRi and SI-SNRi over Voicefilter baseline while achieving higher speaker identity accuracy (up to 95.4%) and novel absent-speaker detection metrics (72.4% NER with SPIT). Though evaluation is limited to two-speaker mixtures and clean conditions, these contributions move toward more reliable, practical target-speaker extraction. However, the task remains speech extraction, not silent speech interface. Absent-speaker detection remains imperfect, limiting real-world deployment readiness.",
      "expert_true_value": "The key advance is robust target speaker extraction with explicit handling of absent speakers via integration of speaker verification and training strategies rather than assuming known speaker count or always-present target speaker.",
      "canon_before": "Time-domain speech separation methods like TasNet are effective but assume known speaker count and do not perform reliable speaker extraction especially when the target may be absent.",
      "delta_from_canon": "The method explicitly integrates speaker verification embeddings into a time-domain extraction network, uses distortion-based loss and alternating training, and handles absent-speaker scenarios improving robustness and extraction accuracy.",
      "position_in_field": "A useful benchmark advancing robust target-speaker extraction but outside silent speech interfaces.",
      "practical_value": "Relevant for improving diarization, communication filtering, and speaker extraction tasks requiring known or unknown speaker counts with absent-target robustness.",
      "axes_moved": "system_design; evaluation",
      "axes_unresolved": "Absent-speaker detection remains imperfect and the task is limited to speech extraction, not silent speech interfaces.",
      "axes_regressed": "",
      "technical_limits": "Requires clean reference audio; absent-speaker detection limited to 72.4% accuracy; evaluated only on clean two-speaker mixtures; output not suitable for silent speech interfaces.",
      "evaluation_limits": "Evaluation focused on two-speaker mixtures; only on clean speech mixtures from LibriSpeech; absent-speaker presence detection evaluated but still imperfect; metrics focused on SDRi, SI-SNRi, NSR, and speaker error rate (SpkER).",
      "deployment_limits": "Requires a reference speech utterance from target speaker; limited to single-channel scenarios; absent-speaker detection accuracy below 80%; not designed for silent speech interfaces.",
      "scope_limits": "Single-channel mixed speech; requires reference utterance; two-speaker mixtures in evaluation; no silent speech or command recognition tested.",
      "task": "target speaker extraction",
      "input_modality": "mixed speech plus reference utterance audio from target speaker",
      "sensor_hardware": "",
      "body_site": "",
      "output_type": "speech-audio",
      "vocabulary_type": "",
      "vocabulary_size": "",
      "metrics": "X-TaSNet achieves SDRi 14.7 dB, SI-SNRi 13.8 dB, NSR 4.3%, SpkER 4.6%; X-TaSNet-PIT achieves SISI-SNRi 14.5 dB and NER 72.4% (absent speaker detection). Voicefilter baseline SDRi 7.4 dB and SI-SNRi 6.4 dB with NSR 9.2%.",
      "evaluation_mode": "objective metrics including SI-SNR improvement, SDR improvement, Negative SI-SNR Rate (NSR), Speaker Error Rate (SpkER); subjective listening for SpkER; analysis of absent speaker detection via Negative Energy Rate (NER) and energy distribution.",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 0.95,
          "statement": "The paper proposes X-TaSNet, the first time-domain speaker extraction approach combined with a speaker verification model, introducing a distortion-based loss and alternating training scheme, along with new metrics and absent speaker scenario training.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_x-tasnet-robust-and-accurate-time-domain-speaker-extraction-network-x-tasnet-robust-and-accurate-time-domain-speaker-extraction-network.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_x-tasnet-robust-and-accurate-time-domain-speaker-extraction-network-x-tasnet-robust-and-accurate-time-domain-speaker-extraction-network.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 0.95,
          "statement": "X-TaSNet integrates a pretrained GE2E speaker verification model with a Conv-TaSNet time-domain speech extraction network, adds a loss on distortion speakers and an alternating training scheme to improve the extraction of the correct target speaker and robustness to absent speakers.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_x-tasnet-robust-and-accurate-time-domain-speaker-extraction-network-x-tasnet-robust-and-accurate-time-domain-speaker-extraction-network.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_x-tasnet-robust-and-accurate-time-domain-speaker-extraction-network-x-tasnet-robust-and-accurate-time-domain-speaker-extraction-network.txt",
          "section_or_location": "2. Model",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.95,
          "statement": "The paper introduces new metrics Negative SI-SNRi Rate (NSR), Speaker Error Rate (SpkER), and Negative Energy Rate (NER) to measure speaker extraction correctness and absent speaker detection beyond traditional SDRi and SI-SNRi.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_x-tasnet-robust-and-accurate-time-domain-speaker-extraction-network-x-tasnet-robust-and-accurate-time-domain-speaker-extraction-network.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_x-tasnet-robust-and-accurate-time-domain-speaker-extraction-network-x-tasnet-robust-and-accurate-time-domain-speaker-extraction-network.txt",
          "section_or_location": "4. Experiments",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.9,
          "statement": "X-TaSNet achieves SDRi 14.7 dB, SI-SNRi 13.8 dB, NSR 4.3%, SpkER 4.6%, while X-TaSNet-PIT reaches SISI-SNRi 14.5 dB and NER 72.4% absent-speaker detection; compared to Voicefilter baseline with SDRi 7.4 dB, SI-SNRi 6.4 dB, NSR 9.2%.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_x-tasnet-robust-and-accurate-time-domain-speaker-extraction-network-x-tasnet-robust-and-accurate-time-domain-speaker-extraction-network.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_x-tasnet-robust-and-accurate-time-domain-speaker-extraction-network-x-tasnet-robust-and-accurate-time-domain-speaker-extraction-network.txt",
          "section_or_location": "4. Experiments",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 0.9,
          "statement": "Absent-speaker detection accuracy remains below 80%, and the approach requires a reference speech segment; evaluated only on mixtures of two different speakers from LibriSpeech clean subset; method constrained to speech extraction, not silent speech interfaces.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_x-tasnet-robust-and-accurate-time-domain-speaker-extraction-network-x-tasnet-robust-and-accurate-time-domain-speaker-extraction-network.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_x-tasnet-robust-and-accurate-time-domain-speaker-extraction-network-x-tasnet-robust-and-accurate-time-domain-speaker-extraction-network.txt",
          "section_or_location": "4. Experiments",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "deployment_claim",
          "confidence": 0.85,
          "statement": "The model requires a reference utterance from the target speaker and currently operates on single-channel speech; absent-speaker robust extraction is improved but imperfect, limiting deployment; silent-speech and multimodal SSI are out of scope.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_x-tasnet-robust-and-accurate-time-domain-speaker-extraction-network-x-tasnet-robust-and-accurate-time-domain-speaker-extraction-network.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_x-tasnet-robust-and-accurate-time-domain-speaker-extraction-network-x-tasnet-robust-and-accurate-time-domain-speaker-extraction-network.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 0.9,
          "statement": "Evaluations conducted on mixtures of two speakers using LibriSpeech clean subset and voicefilter dataset; includes absent speaker conditions but no real-world noisy or multi-speaker scenarios beyond two-speaker.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_x-tasnet-robust-and-accurate-time-domain-speaker-extraction-network-x-tasnet-robust-and-accurate-time-domain-speaker-extraction-network.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_x-tasnet-robust-and-accurate-time-domain-speaker-extraction-network-x-tasnet-robust-and-accurate-time-domain-speaker-extraction-network.txt",
          "section_or_location": "4. Experiments",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_listening-to-sounds-of-silence-for-speech-denoising",
      "slug": "listening-to-sounds-of-silence-for-speech-denoising",
      "title": "Listening to Sounds of Silence for Speech Denoising",
      "year": 2020,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Ruilin Xu",
        "Rundi Wu",
        "Yuko Ishiwaka",
        "Carl Vondrick",
        "Changxi Zheng"
      ],
      "url": "https://nao-ki-mura.com/paper/listening-to-sounds-of-silence-for-speech-denoising",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2010.12013",
      "arxiv_url": "https://arxiv.org/abs/2010.12013",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "modality:acoustic",
        "output:speech-audio",
        "task:speech-enhancement",
        "evaluation:quantitative",
        "evaluation:structured-benchmark"
      ],
      "expert_take_short": "Strong denoising work, not SSI.",
      "expert_take_long": "The full text makes the contribution clearer than the abstract alone: the value is not merely that silence exists, but that silent-interval supervision materially improves both interval detection and downstream denoising. Table 2 shows each proposed component matters, and Table 3 shows the method is competitive on VoiceBank-DEMAND despite being designed for harsher SNR ranges. That said, it remains microphone-based speech enhancement rather than silent-speech decoding.",
      "expert_true_value": "A strong speech-denoising paper with careful ablations and broad evaluation, but it is not a silent-speech interface paper.",
      "canon_before": "Speech denoisers typically treat speech regions directly and do not use naturally occurring pauses as explicit noise probes.",
      "delta_from_canon": "This paper turns silent-interval detection into a supervisory signal for a two-step denoising pipeline.",
      "position_in_field": "Adjacent speech-enhancement work outside SSI.",
      "practical_value": "Relevant as an audio-only denoising reference if one wants to exploit pauses, not as an SSI method.",
      "axes_moved": "noise_estimation; silence_supervision; robustness",
      "axes_unresolved": "performance in truly pause-free speech and SSI relevance remain unresolved.",
      "axes_regressed": "",
      "technical_limits": "The method assumes speech with natural pauses and still requires accurate silent-interval detection to realize the full gain.",
      "evaluation_limits": "Real-world tests are qualitative because clean references are unavailable, and the benchmark comparison is centered on denoising corpora rather than SSI tasks.",
      "deployment_limits": "No articulatory sensing, silent communication, or SSI deployment claim is present.",
      "scope_limits": "Single-channel speech denoising only.",
      "task": "speech-enhancement",
      "input_modality": "mono speech audio with detected silent intervals",
      "sensor_hardware": "microphone",
      "body_site": "",
      "output_type": "speech-audio",
      "vocabulary_type": "",
      "vocabulary_size": "",
      "metrics": "On VoiceBank-DEMAND the model reports PESQ 3.16 and STOI 0.98; silent-interval detection reaches F1 0.869 / accuracy 0.918 on DEMAND and F1 0.807 / accuracy 0.873 on AudioSet.",
      "evaluation_mode": "multi-dataset denoising benchmark plus silent-interval detection metrics, ablations, VoiceBank-DEMAND comparison, and real-world tests",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 0.97,
          "statement": "The abstract says the method leverages incidental silent intervals to learn automatic speech denoising from mono-channel audio.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_listening-to-sounds-of-silence-for-speech-denoising-listening-to-sounds-of-silence-for-speech-denoising.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_listening-to-sounds-of-silence-for-speech-denoising-listening-to-sounds-of-silence-for-speech-denoising.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.96,
          "statement": "Table 1 reports silent-interval detection at precision 0.876, recall 0.866, F1 0.869, and accuracy 0.918 on DEMAND, with similarly strong AudioSet results.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_listening-to-sounds-of-silence-for-speech-denoising-listening-to-sounds-of-silence-for-speech-denoising.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_listening-to-sounds-of-silence-for-speech-denoising-listening-to-sounds-of-silence-for-speech-denoising.txt",
          "section_or_location": "Table 1",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.96,
          "statement": "Table 3 reports the model at PESQ 3.16 and STOI 0.98 on VoiceBank-DEMAND, competitive with the strongest published baselines.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_listening-to-sounds-of-silence-for-speech-denoising-listening-to-sounds-of-silence-for-speech-denoising.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_listening-to-sounds-of-silence-for-speech-denoising-listening-to-sounds-of-silence-for-speech-denoising.txt",
          "section_or_location": "Table 3",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 0.92,
          "statement": "Section 4.6 says quantitative real-world evaluation is difficult because clean references are unavailable, so those tests are qualitative.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_listening-to-sounds-of-silence-for-speech-denoising-listening-to-sounds-of-silence-for-speech-denoising.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_listening-to-sounds-of-silence-for-speech-denoising-listening-to-sounds-of-silence-for-speech-denoising.txt",
          "section_or_location": "4.6 Tests on real-world data",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_discriminative-sounding-objects-localization-via-self-supervised-audiovisual-matching",
      "slug": "discriminative-sounding-objects-localization-via-self-supervised-audiovisual-matching",
      "title": "Discriminative Sounding Objects Localization via Self-supervised Audiovisual Matching",
      "year": 2020,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Di Hu",
        "Rui Qian",
        "Minyue Jiang",
        "Xiao Tan",
        "Shilei Wen",
        "Errui Ding",
        "Weiyao Lin",
        "Dejing Dou"
      ],
      "url": "https://nao-ki-mura.com/paper/discriminative-sounding-objects-localization-via-self-supervised-audiovisual-matching",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2010.05466",
      "arxiv_url": "https://arxiv.org/abs/2010.05466",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "modality:multimodal",
        "output:labels"
      ],
      "expert_take_short": "Technically solid self-supervised class-aware audiovisual sounding object localization, but outside the core SSI domain.",
      "expert_take_long": "This work is a methodologically strong demonstration of class-aware audiovisual object localization through a self-supervised two-stage learning approach. By first aggregating single-source localization maps into an object dictionary, and then leveraging audiovisual consistency to discriminate sounding vs silent objects in cocktail-party scenes, it advances prior sound localization methods that were not class-aware or capable of silent-object suppression. However, the approach depends on curated audiovisual musical datasets and requires prior splitting of single- versus multi-source data, limiting end-to-end deployment. Moreover, while the problem setting aligns with auditory scene analysis, it is not an SSI (silent speech interface) paper proper and should be considered as contributing to adjacent multimodal scene understanding. The novel CIoU and NSA metrics provide meaningful structured evaluation for this complex task. Overall, the work is valuable for audiovisual multimodal perception research, but its applicability to silent speech should not be overclaimed.",
      "expert_true_value": "The core contribution is class-aware sounding object localization with silent object filtering enabled by a two-stage representation-learning framework and audiovisual category distribution alignment — not silent-speech interface or interaction.",
      "canon_before": "Prior audiovisual localization methods typically find active sound source regions but cannot discriminate which object class is sounding in mixed, cocktail-party scenes.",
      "delta_from_canon": "Shift from generic sounding area detection to class-aware sounding object localization and silent-object filtering using an object dictionary and audiovisual category distribution alignment.",
      "position_in_field": "Adjacent audiovisual multimodal perception reference, not core silent speech interface benchmark.",
      "practical_value": "Improves audiovisual scene understanding and localization in mixed sound environments, helpful for machine perception research but not for silent speech interaction systems.",
      "axes_moved": "system_design; evaluation",
      "axes_unresolved": "Approach requires rough prior partitioning of single-source vs multi-source videos, limiting end-to-end applicability.",
      "axes_regressed": "",
      "technical_limits": "Not SSI-specific; depends on annotated audiovisual instrument datasets; requires separate single/multi-source dataset partitioning; no reported real-time or mobile capabilities.",
      "evaluation_limits": "Evaluations limited to musical instrument localization datasets, synthetic and realistic audiovisual clips; no specific silent speech or speech-domain tasks tested.",
      "deployment_limits": "Method relies on curated musical/instrument datasets with bounding box annotations and needs rough scenario partitioning; not designed for real-time or mobile deployment, nor SSI-specific interaction tasks.",
      "scope_limits": "Localization of sounding objects in musical instrument and AudioSet-instrument audiovisual datasets including synthetic and realistic cocktail-party videos.",
      "task": "class-aware sounding object localization",
      "input_modality": "audio plus video in cocktail-party scenes",
      "sensor_hardware": "",
      "body_site": "",
      "output_type": "labels",
      "vocabulary_type": "",
      "vocabulary_size": "",
      "metrics": "On MUSIC-synthetic, CIoU/AUC/NSA are 32.3/23.5/98.5; on MUSIC-duet 30.2/22.1/83.1; on AudioSet-instrument-multi 48.7/29.7/56.8 (Table 2). Evaluation metrics include IoU, AUC for single-source, plus novel class-aware IoU and silent object area suppression metrics for cocktail-party localization.",
      "evaluation_mode": "Quantitative localization metrics on single-source and cocktail-party audiovisual video datasets; includes novel class-aware IoU and silent-object filtering metrics.",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 1,
          "statement": "The paper proposes self-supervised class-aware sounding object localization that can distinguish sounding from silent objects in mixed scenes.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_discriminative-sounding-objects-localization-via-self-supervised-audiovisual-matching-discriminative-sounding-objects-localization-via-self-supervised-audiovisual-mat.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_discriminative-sounding-objects-localization-via-self-supervised-audiovisual-matching-discriminative-sounding-objects-localization-via-self-supervised-audiovisual-mat.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 1,
          "statement": "This paper proposes a two-stage learning framework: first learning object representations from single-source scenes, then performing class-aware audiovisual matching for discriminative sounding object localization in cocktail-party scenarios.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_discriminative-sounding-objects-localization-via-self-supervised-audiovisual-matching-discriminative-sounding-objects-localization-via-self-supervised-audiovisual-mat.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_discriminative-sounding-objects-localization-via-self-supervised-audiovisual-matching-discriminative-sounding-objects-localization-via-self-supervised-audiovisual-mat.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "They introduced novel Class-aware IoU (CIoU) and No-Sounding-Area (NSA) metrics to evaluate localization of multiple sounding objects and silent object filtering.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_discriminative-sounding-objects-localization-via-self-supervised-audiovisual-matching-discriminative-sounding-objects-localization-via-self-supervised-audiovisual-mat.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_discriminative-sounding-objects-localization-via-self-supervised-audiovisual-matching-discriminative-sounding-objects-localization-via-self-supervised-audiovisual-mat.txt",
          "section_or_location": "Datasets",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "On MUSIC-synthetic, the method achieves CIoU/AUC/NSA of 32.3/23.5/98.5; on MUSIC-duet 30.2/22.1/83.1; on AudioSet-instrument-multi 48.7/29.7/56.8.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_discriminative-sounding-objects-localization-via-self-supervised-audiovisual-matching-discriminative-sounding-objects-localization-via-self-supervised-audiovisual-mat.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_discriminative-sounding-objects-localization-via-self-supervised-audiovisual-matching-discriminative-sounding-objects-localization-via-self-supervised-audiovisual-mat.txt",
          "section_or_location": "4.4 Multiple sounding objects localization",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 1,
          "statement": "The approach requires a rough prior partitioning of the dataset into single-source and multi-source scenarios, limiting end-to-end deployment.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_discriminative-sounding-objects-localization-via-self-supervised-audiovisual-matching-discriminative-sounding-objects-localization-via-self-supervised-audiovisual-mat.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_discriminative-sounding-objects-localization-via-self-supervised-audiovisual-matching-discriminative-sounding-objects-localization-via-self-supervised-audiovisual-mat.txt",
          "section_or_location": "5 Discussion",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 1,
          "statement": "The evaluation is limited to curated musical instrument audiovisual datasets (MUSIC, AudioSet-instrument) and synthetic/realistic cocktail-party videos, without testing speech-related or SSI tasks.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_discriminative-sounding-objects-localization-via-self-supervised-audiovisual-matching-discriminative-sounding-objects-localization-via-self-supervised-audiovisual-mat.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_discriminative-sounding-objects-localization-via-self-supervised-audiovisual-matching-discriminative-sounding-objects-localization-via-self-supervised-audiovisual-mat.txt",
          "section_or_location": "4.1 Datasets",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "deployment_claim",
          "confidence": 1,
          "statement": "The task and method target audiovisual sounding object localization, not silent speech interaction; they rely on curated datasets and require scenario labeling for single-source vs multi-source videos.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_discriminative-sounding-objects-localization-via-self-supervised-audiovisual-matching-discriminative-sounding-objects-localization-via-self-supervised-audiovisual-mat.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_discriminative-sounding-objects-localization-via-self-supervised-audiovisual-matching-discriminative-sounding-objects-localization-via-self-supervised-audiovisual-mat.txt",
          "section_or_location": "1 Introduction",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "fact",
          "confidence": 1,
          "statement": "The paper uses multimodal input: audio and video (visual frames). Output is localization maps with class labels indicating sounding objects.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_discriminative-sounding-objects-localization-via-self-supervised-audiovisual-matching-discriminative-sounding-objects-localization-via-self-supervised-audiovisual-mat.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_discriminative-sounding-objects-localization-via-self-supervised-audiovisual-matching-discriminative-sounding-objects-localization-via-self-supervised-audiovisual-mat.txt",
          "section_or_location": "3 The proposed method",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_digital-voicing-of-silent-speech",
      "slug": "digital-voicing-of-silent-speech",
      "title": "Digital Voicing of Silent Speech",
      "year": 2020,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "David Gaddy",
        "Dan Klein"
      ],
      "url": "https://nao-ki-mura.com/paper/digital-voicing-of-silent-speech",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2010.02960",
      "arxiv_url": "https://arxiv.org/abs/2010.02960",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:face",
        "body_site:throat",
        "modality:emg",
        "task:speech-reconstruction",
        "output:speech-audio",
        "deployment:speaker-dependent",
        "deployment:wearable",
        "evaluation:quantitative",
        "evaluation:unseen-words"
      ],
      "expert_take_short": "Core EMG SSI paper with real gains from target transfer.",
      "expert_take_long": "The full text backs a strong SSI claim: the target-transfer pipeline is the key step that makes silent EMG viable for speech reconstruction instead of merely hoping a model trained on vocalized EMG will transfer. The closed-vocabulary result is genuinely strong, and the open-vocabulary numbers remain difficult but materially better than baseline. The main remaining weakness is generalization: the setup is speaker-dependent, performance in open vocabulary is still rough, and the system depends on relatively heavy data collection and alignment machinery.",
      "expert_true_value": "Strong EMG SSI paper showing that silent EMG can produce intelligible speech if training explicitly handles the mismatch between silent and vocalized articulations.",
      "canon_before": "Prior EMG-to-speech work largely trained on vocalized EMG and transferred poorly to silent EMG because aligned speech targets were missing.",
      "delta_from_canon": "Introduces target transfer, CCA alignment, and predicted-audio refinement so silent EMG can train a speech generator directly.",
      "position_in_field": "Core EMG silent-speech reconstruction paper with clear methodological relevance to SSI.",
      "practical_value": "Important evidence that alignment strategy, not just model size, is the bottleneck in silent-EMG-to-speech generation.",
      "axes_moved": "system_design; problem_reframing; evaluation",
      "axes_unresolved": "Cross-speaker transfer; lighter sensors; better open-vocabulary intelligibility",
      "axes_regressed": "",
      "technical_limits": "Speaker-dependent setup with substantial data collection and still-high WER in open-vocabulary conditions.",
      "evaluation_limits": "Open-vocabulary outputs remain far from production quality, and all results come from the authors' data collection setup.",
      "deployment_limits": "Requires facial EMG instrumentation and a speaker-specific training pipeline.",
      "scope_limits": "Silent EMG to speech reconstruction only.",
      "task": "speech-reconstruction",
      "input_modality": "surface emg",
      "sensor_hardware": "facial EMG electrodes",
      "body_site": "face; jaw; throat",
      "output_type": "speech-audio",
      "vocabulary_type": "closed vocabulary dates/times plus open vocabulary book text",
      "vocabulary_size": "9828 words in open-vocabulary condition",
      "metrics": "Closed-vocabulary human WER reaches 3.6% with a 94% relative error reduction from the strongest baseline; open-vocabulary human WER drops from 95.1% to 74.8%; automatic open-vocabulary WER drops from 91.2% to 68.0%",
      "evaluation_mode": "human transcription WER, automatic ASR WER, and ablations on data size and electrode subsets",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 0.99,
          "statement": "The paper studies digital voicing of silent speech from facial EMG and proposes target transfer from vocalized recordings so silent EMG can train a speech generator.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_digital-voicing-of-silent-speech-digital-voicing-of-silent-speech.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_digital-voicing-of-silent-speech-digital-voicing-of-silent-speech.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 0.98,
          "statement": "Section 3.2 explains that audio target transfer aligns silent and vocalized EMG so silent EMG can receive speech targets, which is the core step that differentiates the method from direct transfer baselines.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_digital-voicing-of-silent-speech-digital-voicing-of-silent-speech.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_digital-voicing-of-silent-speech-digital-voicing-of-silent-speech.txt",
          "section_or_location": "3.2   Audio Target Transfer",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.98,
          "statement": "The closed-vocabulary human evaluation reports 3.6% WER for the full model, described as a 94% relative error reduction from the strongest baseline.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_digital-voicing-of-silent-speech-digital-voicing-of-silent-speech.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_digital-voicing-of-silent-speech-digital-voicing-of-silent-speech.txt",
          "section_or_location": "4.1    Closed Vocabulary Condition",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_end-to-end-speaker-dependent-voice-activity-detection",
      "slug": "end-to-end-speaker-dependent-voice-activity-detection",
      "title": "End-to-End Speaker-Dependent Voice Activity Detection",
      "year": 2020,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Yefei Chen",
        "Shuai Wang",
        "Yanmin Qian",
        "Kai Yu"
      ],
      "url": "https://nao-ki-mura.com/paper/end-to-end-speaker-dependent-voice-activity-detection",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2009.09906",
      "arxiv_url": "https://arxiv.org/abs/2009.09906",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "modality:acoustic",
        "output:labels",
        "task:audio-classification",
        "deployment:real-time",
        "deployment:speaker-dependent",
        "evaluation:quantitative"
      ],
      "expert_take_short": "Strong target-speaker VAD paper, not SSI.",
      "expert_take_long": "The full text supports the claimed speech-processing advance: end-to-end speaker-aware VAD improves frame accuracy and F-score over the two-stage baseline and can run online with negligible latency. The main caution is scope. This is target-speaker activity detection for audible speech, not silent-speech sensing, and even inside its own task the paper shows segment-level fragmentation problems that need feature binning/post-processing to stabilize boundaries.",
      "expert_true_value": "Competent target-speaker VAD paper, but not an SSI contribution.",
      "canon_before": "Speaker-dependent VAD was usually implemented as a two-stage VAD plus speaker-verification cascade with added latency.",
      "delta_from_canon": "Moves target-speaker conditioning inside the model and shows large frame-level gains from feature binning and end-to-end training.",
      "position_in_field": "Speech-processing paper adjacent to SSI only through target-speaker filtering, not silent speech.",
      "practical_value": "Useful reference for low-latency target-speaker VAD front ends in speech systems.",
      "axes_moved": "system_design; evaluation",
      "axes_unresolved": "More robust segment boundaries; broader speaker generalization",
      "axes_regressed": "",
      "technical_limits": "Speaker-dependent task only and still vulnerable to fragmentation without feature binning or post-processing.",
      "evaluation_limits": "Segment-level quality lags frame-level gains, so the best headline metrics overstate temporal cleanliness.",
      "deployment_limits": "Useful for speech pipelines, but unrelated to silent-speech interface deployment.",
      "scope_limits": "Target-speaker voice activity detection only.",
      "task": "audio-classification",
      "input_modality": "speech audio",
      "sensor_hardware": "microphone",
      "body_site": "",
      "output_type": "labels",
      "vocabulary_type": "",
      "vocabulary_size": "",
      "metrics": "Best LSTM SDVAD+binning+post reaches 94.62% ACC and 93.47% F-score; segment-level J-VAD for LSTM SDVAD+binning is 73.66% versus 76.68% for the LSTM VAD/SV baseline due to fragmentation effects",
      "evaluation_mode": "frame-level ACC/F-score and segment-level J-VAD analysis",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 0.98,
          "statement": "The paper defines speaker-dependent VAD as detecting only the target speaker and claims the end-to-end approach can perform online predictions directly with negligible latency.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_end-to-end-speaker-dependent-voice-activity-detection-end-to-end-speaker-dependent-voice-activity-detection.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_end-to-end-speaker-dependent-voice-activity-detection-end-to-end-speaker-dependent-voice-activity-detection.txt",
          "section_or_location": "5. Conclusion",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.98,
          "statement": "Table 1 reports that LSTM SDVAD+binning+post reaches 94.62% ACC and 93.47% F-score, substantially above the listed baselines.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_end-to-end-speaker-dependent-voice-activity-detection-end-to-end-speaker-dependent-voice-activity-detection.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_end-to-end-speaker-dependent-voice-activity-detection-end-to-end-speaker-dependent-voice-activity-detection.txt",
          "section_or_location": "Table 1: ACC(%) and F-score(%) of different systems.",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 0.96,
          "statement": "The segment-level evaluation explains that the original SDVAD system suffers from fragmentation, with poor border precision until feature binning is applied.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_end-to-end-speaker-dependent-voice-activity-detection-end-to-end-speaker-dependent-voice-activity-detection.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_end-to-end-speaker-dependent-voice-activity-detection-end-to-end-speaker-dependent-voice-activity-detection.txt",
          "section_or_location": "4.4 Segment level Evaluation",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_a-comparison-of-oscillatory-characteristics-in-covert-speech-and-speech-perception",
      "slug": "a-comparison-of-oscillatory-characteristics-in-covert-speech-and-speech-perception",
      "title": "A comparison of oscillatory characteristics in covert speech and speech perception",
      "year": 2020,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Jae Moon",
        "Silvia Orlandi",
        "Tom Chau"
      ],
      "url": "https://nao-ki-mura.com/paper/a-comparison-of-oscillatory-characteristics-in-covert-speech-and-speech-perception",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2009.02816",
      "arxiv_url": "https://arxiv.org/abs/2009.02816",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:brain",
        "modality:eeg",
        "output:labels",
        "evaluation:quantitative"
      ],
      "expert_take_short": "Strong covert-speech EEG analysis, not an SSI system.",
      "expert_take_long": "The full text supports a narrow but meaningful claim: this paper maps how covert speech and speech perception differ in EEG oscillatory structure, with covert speech favoring higher-frequency activity and perception showing stronger delta/theta involvement. The most useful result for SSI-adjacent BCI work is the reported relationship between perception theta and covert-speech gamma, which motivates passive-training ideas for future covert-speech decoders. But the study remains a small-participant analysis paper with no interface, no decoding benchmark, and a lexicon limited to short words.",
      "expert_true_value": "Useful foundational EEG evidence for covert-speech BCI modeling, but not itself an SSI system or decoder.",
      "canon_before": "Covert-speech BCI studies often borrowed intuition from speech perception without directly characterizing how their oscillatory roles differ.",
      "delta_from_canon": "Adds a direct EEG comparison showing covert speech favors higher-frequency activity while speech perception shows stronger delta/theta structure and cross-task theta-gamma coupling.",
      "position_in_field": "Foundational covert-speech neuroscience paper adjacent to SSI rather than a deployable interface study.",
      "practical_value": "Helps future covert-speech BCI work decide which oscillatory bands may transfer from speech perception signals.",
      "axes_moved": "evaluation; problem_reframing",
      "axes_unresolved": "Larger lexicon; stronger decoding link; real BCI transfer",
      "axes_regressed": "",
      "technical_limits": "Eight-participant EEG study with a small lexicon; no real SSI decoder, no user study, and no deployment experiment.",
      "evaluation_limits": "Participant variability is visible in the classification table, and the limited lexicon may make some cross-task rhythmic similarities partly task-driven.",
      "deployment_limits": "Would need a full covert-speech BCI stack and richer lexical coverage before any interface use.",
      "scope_limits": "Oscillatory analysis of covert speech versus speech perception only.",
      "task": "covert speech analysis",
      "input_modality": "eeg",
      "sensor_hardware": "EEG cap",
      "body_site": "brain",
      "output_type": "labels",
      "vocabulary_type": "1-2 syllable spoken words",
      "vocabulary_size": "small fixed lexicon",
      "metrics": "Per-participant 10-fold SVM precision/recall/F1; Wilcoxon tests show stronger delta/theta involvement for perception (p<0.01 and p<0.0001) and stronger low-gamma involvement for covert speech (p<0.05); significant theta-gamma PAC reported in the 200-500 ms window",
      "evaluation_mode": "10-fold cross-validated classification with Wilcoxon frequency-band statistics and PAC analysis",
      "evidence": [
        {
          "claim_type": "actual_novelty",
          "confidence": 0.98,
          "statement": "The study directly compares covert speech and speech perception with EEG and argues that covert speech favors higher-frequency activity while perception shows stronger lower-frequency structure.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_a-comparison-of-oscillatory-characteristics-in-covert-speech-and-speech-perception-a-comparison-of-oscillatory-characteristics-in-covert-speech-and-speech-percepti.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_a-comparison-of-oscillatory-characteristics-in-covert-speech-and-speech-perception-a-comparison-of-oscillatory-characteristics-in-covert-speech-and-speech-percepti.txt",
          "section_or_location": "6     Conclusion",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.96,
          "statement": "The classification table reports per-participant 10-fold SVM precision, recall, and F1 scores, and the text states that speech perception engages significantly more delta and theta while covert speech shows more low-gamma involvement.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_a-comparison-of-oscillatory-characteristics-in-covert-speech-and-speech-perception-a-comparison-of-oscillatory-characteristics-in-covert-speech-and-speech-percepti.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_a-comparison-of-oscillatory-characteristics-in-covert-speech-and-speech-perception-a-comparison-of-oscillatory-characteristics-in-covert-speech-and-speech-percepti.txt",
          "section_or_location": "Table 1: Classification scores for each classification type and for each participant.",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 0.95,
          "statement": "The paper notes that the lexicon varied only between one and two syllables spoken at the same rate, and calls for richer lexicons and sentential forms in future work.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_a-comparison-of-oscillatory-characteristics-in-covert-speech-and-speech-perception-a-comparison-of-oscillatory-characteristics-in-covert-speech-and-speech-percepti.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_a-comparison-of-oscillatory-characteristics-in-covert-speech-and-speech-perception-a-comparison-of-oscillatory-characteristics-in-covert-speech-and-speech-percepti.txt",
          "section_or_location": "5.4 General discussion and Limitations",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_silent-speech-interfaces-for-speech-restoration-a-review",
      "slug": "silent-speech-interfaces-for-speech-restoration-a-review",
      "title": "Silent Speech Interfaces for Speech Restoration: A Review",
      "year": 2020,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Jose A. Gonzalez-Lopez",
        "Alejandro Gomez-Alanis",
        "Juan M. Martin-Donas",
        "Jose L. Perez-Cordoba",
        "Angel M. Gomez"
      ],
      "url": "https://nao-ki-mura.com/paper/silent-speech-interfaces-for-speech-restoration-a-review",
      "doi": "10.1109/ACCESS.2020.3026579",
      "doi_url": "https://doi.org/10.1109/ACCESS.2020.3026579",
      "arxiv_id": "2009.02110",
      "arxiv_url": "https://arxiv.org/abs/2009.02110",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "modality:multimodal",
        "task:survey"
      ],
      "expert_take_short": "Core SSI survey with concrete deployment constraints.",
      "expert_take_long": "The full text is strongest where it stops being a catalog and starts naming bottlenecks. The latency discussion is unusually concrete, arguing that 50 ms is ideal and 100 ms may still be acceptable, which sharply favors direct synthesis over slower ASR-TTS cascades. The later challenge section also makes clear that SSI progress is held back less by model fashion than by tiny patient datasets, weak public data availability, and difficult real-world sensor placement.",
      "expert_true_value": "A strong field survey that is most useful for identifying what blocks SSI deployment: latency, scarce patient data, and modality-specific tradeoffs.",
      "canon_before": "SSI literature was fragmented across modalities and clinical populations.",
      "delta_from_canon": "This review consolidates the sensing landscape and makes latency, patient data scarcity, and deployment practicality explicit cross-cutting bottlenecks.",
      "position_in_field": "Core SSI background review for speech-restoration framing.",
      "practical_value": "Useful as a top-level map of SSI modalities and system bottlenecks.",
      "axes_moved": "field_mapping; latency_requirements; modality_tradeoffs",
      "axes_unresolved": "patient-scale data, robust public datasets, and clinically viable deployment remain unresolved.",
      "axes_regressed": "",
      "technical_limits": "It is a review and therefore inherits the uneven quality and comparability of the literature it summarizes.",
      "evaluation_limits": "The paper synthesizes prior results rather than running a unified benchmark.",
      "deployment_limits": "It identifies many deployment obstacles but does not itself resolve them.",
      "scope_limits": "Speech-restoration SSI review, not a new system.",
      "task": "survey",
      "input_modality": "non-acoustic biosignals and articulator sensing",
      "sensor_hardware": "multimodal biosignal sensors",
      "body_site": "brain; face; lip; oral-cavity; throat; tongue",
      "output_type": "",
      "vocabulary_type": "",
      "vocabulary_size": "",
      "metrics": "The review cites an ideal SSI latency of about 50 ms, acceptable latency up to 100 ms, and disruptive delayed-feedback effects by 200 ms.",
      "evaluation_mode": "comparative review of sensing modalities, restoration scenarios, latency requirements, and dataset constraints",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 0.97,
          "statement": "The abstract frames the paper as a review of SSI research for speech restoration using non-acoustic biosignals and articulator sensing.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_silent-speech-interfaces-for-speech-restoration-a-review-silent-speech-interfaces-for-speech-restoration-a-review.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_silent-speech-interfaces-for-speech-restoration-a-review-silent-speech-interfaces-for-speech-restoration-a-review.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "fact",
          "confidence": 0.95,
          "statement": "The latency discussion states that 50 ms is ideal for SSI, values up to 100 ms may still be acceptable, and larger delays disrupt communication.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_silent-speech-interfaces-for-speech-restoration-a-review-silent-speech-interfaces-for-speech-restoration-a-review.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_silent-speech-interfaces-for-speech-restoration-a-review-silent-speech-interfaces-for-speech-restoration-a-review.txt",
          "section_or_location": "IV. SENSING TECHNIQUES",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 0.95,
          "statement": "The challenges section says SSI studies mostly rely on small datasets recorded by different groups and that public datasets remain a major bottleneck.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_silent-speech-interfaces-for-speech-restoration-a-review-silent-speech-interfaces-for-speech-restoration-a-review.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_silent-speech-interfaces-for-speech-restoration-a-review-silent-speech-interfaces-for-speech-restoration-a-review.txt",
          "section_or_location": "V. CURRENT CHALLENGES AND FUTURE RESEARCH",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 0.93,
          "statement": "The review explicitly covers brain activity sensors, muscle activity sensors, and articulator tracking as alternative SSI sensing routes.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_silent-speech-interfaces-for-speech-restoration-a-review-silent-speech-interfaces-for-speech-restoration-a-review.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_silent-speech-interfaces-for-speech-restoration-a-review-silent-speech-interfaces-for-speech-restoration-a-review.txt",
          "section_or_location": "IV. SENSING TECHNIQUES",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_an-overview-of-deep-learning-based-audio-visual-speech-enhancement-and-separation",
      "slug": "an-overview-of-deep-learning-based-audio-visual-speech-enhancement-and-separation",
      "title": "An Overview of Deep-Learning-Based Audio-Visual Speech Enhancement and Separation",
      "year": 2020,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Daniel Michelsanti",
        "Zheng-Hua Tan",
        "Shi-Xiong Zhang",
        "Yong Xu",
        "Meng Yu",
        "Dong Yu",
        "Jesper Jensen"
      ],
      "url": "https://nao-ki-mura.com/paper/an-overview-of-deep-learning-based-audio-visual-speech-enhancement-and-separation",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2008.09586",
      "arxiv_url": "https://arxiv.org/abs/2008.09586",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:face",
        "body_site:lip",
        "modality:acoustic",
        "modality:microphone",
        "modality:multimodal",
        "modality:video",
        "task:survey",
        "output:speech-audio"
      ],
      "expert_take_short": "Strong AV speech survey, not an SSI system paper.",
      "expert_take_long": "The full text is strong as a survey: it synthesizes how AV speech enhancement and separation systems are built, where visual input helps most, what fusion choices dominate, and why evaluation remains hard to compare across papers. Its practical value is field-mapping rather than new algorithmic evidence. For SSI work, it is adjacent because it touches silent-video speech reconstruction and multimodal speech processing, but it does not introduce a new silent-speech interface or benchmark.",
      "expert_true_value": "Good map of AV speech enhancement/separation design choices and evaluation gaps, but not a primary SSI contribution.",
      "canon_before": "AV speech enhancement and separation knowledge was dispersed across modality choices, fusion strategies, datasets, and evaluation practices.",
      "delta_from_canon": "Organizes the area into acoustic features, visual features, deep learning methods, fusion techniques, training targets, datasets, and evaluation gaps.",
      "position_in_field": "Survey paper adjacent to SSI via audio-visual speech processing rather than silent-speech interaction itself.",
      "practical_value": "Useful orientation document for choosing AV speech enhancement/separation components and understanding current evaluation weaknesses.",
      "axes_moved": "evaluation; problem_reframing",
      "axes_unresolved": "Standardized AV evaluation; low-resource deployment; robust fusion under real-world conditions",
      "axes_regressed": "",
      "technical_limits": "No new benchmark, no unified re-evaluation, and no direct empirical comparison across systems.",
      "evaluation_limits": "The paper explicitly states that lack of standardized AV evaluation makes broad performance ranking hard to interpret.",
      "deployment_limits": "Not a deployable system; conclusions remain dependent on the underlying papers it surveys.",
      "scope_limits": "Survey of AV speech enhancement and separation, plus adjacent silent-video and non-speech AV source separation work.",
      "task": "survey",
      "input_modality": "audio + video",
      "sensor_hardware": "microphone + camera",
      "body_site": "face; lip",
      "output_type": "speech-audio",
      "vocabulary_type": "",
      "vocabulary_size": "",
      "metrics": "Survey identifies PESQ, STOI/ESTOI, SDR/SI-SDR, and WER as common metrics, while noting the lack of standardized audio-visual evaluation procedures",
      "evaluation_mode": "literature review",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 0.98,
          "statement": "The paper presents an overview of deep-learning-based audio-visual speech enhancement and separation systems, including features, fusion methods, training targets, datasets, and evaluation methods.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_an-overview-of-deep-learning-based-audio-visual-speech-enhancement-and-separation-an-overview-of-deep-learning-based-audio-visual-speech-enhancement-and-separatio.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_an-overview-of-deep-learning-based-audio-visual-speech-enhancement-and-separation-an-overview-of-deep-learning-based-audio-visual-speech-enhancement-and-separatio.txt",
          "section_or_location": "XII. CONCLUSION",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "fact",
          "confidence": 0.95,
          "statement": "The conclusion states that visual information benefits both enhancement and separation, and that AV systems can outperform audio-only systems especially at low SNR or when source permutation is an issue.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_an-overview-of-deep-learning-based-audio-visual-speech-enhancement-and-separation-an-overview-of-deep-learning-based-audio-visual-speech-enhancement-and-separatio.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_an-overview-of-deep-learning-based-audio-visual-speech-enhancement-and-separation-an-overview-of-deep-learning-based-audio-visual-speech-enhancement-and-separatio.txt",
          "section_or_location": "XII. CONCLUSION",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 0.97,
          "statement": "The paper explicitly says that fair comparison is hard because of application-specific constraints and the lack of standardized audio-visual evaluation procedures.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_an-overview-of-deep-learning-based-audio-visual-speech-enhancement-and-separation-an-overview-of-deep-learning-based-audio-visual-speech-enhancement-and-separatio.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_an-overview-of-deep-learning-based-audio-visual-speech-enhancement-and-separation-an-overview-of-deep-learning-based-audio-visual-speech-enhancement-and-separatio.txt",
          "section_or_location": "XII. CONCLUSION",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_citisen-a-deep-learning-based-speech-signal-processing-mobile-application",
      "slug": "citisen-a-deep-learning-based-speech-signal-processing-mobile-application",
      "title": "CITISEN: A Deep Learning-Based Speech Signal-Processing Mobile Application",
      "year": 2020,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Yu-Wen Chen",
        "Kuo-Hsuan Hung",
        "You-Jin Li",
        "Alexander Chao-Fu Kang",
        "Ya-Hsin Lai",
        "Kai-Chun Liu",
        "Szu-Wei Fu",
        "Syu-Siang Wang",
        "Yu Tsao"
      ],
      "url": "https://nao-ki-mura.com/paper/citisen-a-deep-learning-based-speech-signal-processing-mobile-application",
      "doi": "10.1109/ACCESS.2022.3153469",
      "doi_url": "https://doi.org/10.1109/ACCESS.2022.3153469",
      "arxiv_id": "2008.09264",
      "arxiv_url": "https://arxiv.org/abs/2008.09264",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "modality:acoustic",
        "modality:microphone",
        "task:speech-enhancement",
        "output:speech-audio",
        "deployment:mobile-suitable",
        "evaluation:quantitative"
      ],
      "expert_take_short": "Strong mobile speech-processing app paper, not SSI.",
      "expert_take_long": "The full text supports a real integration contribution: CITISEN exposes speech enhancement, model adaptation, and background-noise conversion through a mobile application rather than only as isolated models. The results are meaningful for speech-processing deployment, especially the consistent STOI/PESQ gains from adaptation and the >90% BNC scene-accuracy summary. But the scope is audible speech enhancement and noise conversion, not silent-speech sensing or reconstruction, so it should not be presented as an SSI advance.",
      "expert_true_value": "Solid mobile speech-processing integration paper, but it is not an SSI contribution.",
      "canon_before": "Speech-enhancement work often reported model gains without integrating enhancement, adaptation, and controllable background conversion into a user-facing mobile workflow.",
      "delta_from_canon": "Packages enhancement, personalized adaptation, and background-noise conversion into a mobile app backed by cloud inference.",
      "position_in_field": "Speech-processing mobile application adjacent to SSI only through assistive speech enhancement.",
      "practical_value": "Useful as a mobile integration reference for speech enhancement and noise-style conversion workflows.",
      "axes_moved": "system_design; deployment; evaluation",
      "axes_unresolved": "On-device efficiency; broader speech domains; stronger robustness under unseen real-world noise",
      "axes_regressed": "",
      "technical_limits": "Cloud-backed inference and task-specific evaluation mean the app is not a silent-speech or low-resource on-device solution.",
      "evaluation_limits": "Metrics are speech-processing centric and do not establish broad real-world robustness beyond the tested SNR and scene settings.",
      "deployment_limits": "Demonstrated as a mobile speech app, but not as a silent-speech interface and not fully on-device.",
      "scope_limits": "Audible speech enhancement, adaptation, and noise conversion only.",
      "task": "speech-enhancement",
      "input_modality": "acoustic",
      "sensor_hardware": "microphone",
      "body_site": "",
      "output_type": "speech-audio",
      "vocabulary_type": "",
      "vocabulary_size": "",
      "metrics": "Model adaptation improved STOI by 5.06%, 2.94%, and 5.84% and PESQ by 12.48%, 3.32%, and 11.24% for MA(N), MA(S), and MA(N+S) over the FCN baseline; machine-evaluation summary reports BNC accuracy above 90% with CCR dropping when enhanced speech replaces clean speech",
      "evaluation_mode": "objective speech metrics, human listening tests, acoustic-scene classification, and ASR-based machine evaluation",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 0.98,
          "statement": "CITISEN is presented as a mobile speech-signal-processing application that performs speech enhancement, model adaptation, and background-noise conversion.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_citisen-a-deep-learning-based-speech-signal-processing-mobile-application-citisen-a-deep-learning-based-speech-signal-processing-mobile-application.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_citisen-a-deep-learning-based-speech-signal-processing-mobile-application-citisen-a-deep-learning-based-speech-signal-processing-mobile-application.txt",
          "section_or_location": "ABSTRACT",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.97,
          "statement": "The results section reports that MA(N), MA(S), and MA(N+S) improve STOI by 5.06%, 2.94%, and 5.84% and PESQ by 12.48%, 3.32%, and 11.24% over the baseline enhancement model.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_citisen-a-deep-learning-based-speech-signal-processing-mobile-application-citisen-a-deep-learning-based-speech-signal-processing-mobile-application.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_citisen-a-deep-learning-based-speech-signal-processing-mobile-application-citisen-a-deep-learning-based-speech-signal-processing-mobile-application.txt",
          "section_or_location": "TABLE 6. Average STOI and PESQ scores for different SE models over -2, 0,",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "deployment_claim",
          "confidence": 0.95,
          "statement": "The conclusion states that CITISEN implements the three functions on mobile devices, but the system is a speech-processing utility rather than a silent-speech interface.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_citisen-a-deep-learning-based-speech-signal-processing-mobile-application-citisen-a-deep-learning-based-speech-signal-processing-mobile-application.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_citisen-a-deep-learning-based-speech-signal-processing-mobile-application-citisen-a-deep-learning-based-speech-signal-processing-mobile-application.txt",
          "section_or_location": "V. CONCLUSION",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_foley-music-learning-to-generate-music-from-videos",
      "slug": "foley-music-learning-to-generate-music-from-videos",
      "title": "Foley Music: Learning to Generate Music from Videos",
      "year": 2020,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Chuang Gan",
        "Deng Huang",
        "Peihao Chen",
        "Joshua B. Tenenbaum",
        "Antonio Torralba"
      ],
      "url": "https://nao-ki-mura.com/paper/foley-music-learning-to-generate-music-from-videos",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2007.10984",
      "arxiv_url": "https://arxiv.org/abs/2007.10984",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "modality:multimodal",
        "modality:video",
        "output:audio",
        "evaluation:quantitative"
      ],
      "expert_take_short": "Strong video-to-music paper, not SSI.",
      "expert_take_long": "The full text supports a credible music-generation contribution: keypoints plus MIDI make synchronization and structure easier to learn than direct waveform targets, and the paper wins both preference studies and automatic diversity metrics. But none of that is silent-speech interface work. It belongs in a broader multimodal archive only if non-SSI distractors are intentionally retained and clearly labeled as such.",
      "expert_true_value": "Strong multimodal music-generation paper, but outside SSI scope.",
      "canon_before": "Video-to-sound generation often worked in waveform or spectrogram space and struggled to align long-term musical structure with body motion.",
      "delta_from_canon": "Uses body keypoints and MIDI as intermediate representations, turning video-to-music generation into a motion-to-MIDI translation problem.",
      "position_in_field": "Audio-visual generation paper that can distract an SSI corpus if not explicitly labeled out-of-scope.",
      "practical_value": "Useful reference for structured video-to-audio generation where symbolic intermediate representations matter.",
      "axes_moved": "system_design; problem_reframing; evaluation",
      "axes_unresolved": "Broader video domains; fully learned waveform synthesis; real-world robustness",
      "axes_regressed": "",
      "technical_limits": "Scope is instrument-performance video; waveform realism still depends on an external synthesizer and future neural synthesis work.",
      "evaluation_limits": "Human studies are preference-based and confined to the tested instrument/video distributions.",
      "deployment_limits": "No SSI deployment path; this is a multimedia generation system.",
      "scope_limits": "Video-to-music generation only.",
      "task": "",
      "input_modality": "video",
      "sensor_hardware": "camera",
      "body_site": "",
      "output_type": "audio",
      "vocabulary_type": "MIDI event vocabulary",
      "vocabulary_size": "variable MIDI sequence length",
      "metrics": "Human preference rates in Table 1 favor the method in every instrument category, ranging from 56% to 72%; real-vs-fake success reaches 38% versus 8-12% for baselines; NDB is 20 versus 25-33 for baselines",
      "evaluation_mode": "human preference studies, real-vs-fake listening study, NDB diversity metric, and NLL ablations",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 0.99,
          "statement": "The paper introduces Foley Music, which generates plausible music from silent videos of people playing instruments by translating body motion into MIDI and then into audio.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_foley-music-learning-to-generate-music-from-videos-foley-music-learning-to-generate-music-from-videos.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_foley-music-learning-to-generate-music-from-videos-foley-music-learning-to-generate-music-from-videos.txt",
          "section_or_location": "Abstract. In this paper, we introduce Foley Music, a system that can",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.97,
          "statement": "The human comparison study in Table 1 shows the method wins every instrument category, with preference rates ranging from 56% to 72% across the listed instruments.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_foley-music-learning-to-generate-music-from-videos-foley-music-learning-to-generate-music-from-videos.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_foley-music-learning-to-generate-music-from-videos-foley-music-learning-to-generate-music-from-videos.txt",
          "section_or_location": "Table 1. Human evaluation on model comparisons.",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.97,
          "statement": "Automatic evaluation in Table 3 reports NDB 20 for the proposed method versus 25 to 33 for the compared baselines, indicating more diverse generated sound.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_foley-music-learning-to-generate-music-from-videos-foley-music-learning-to-generate-music-from-videos.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_foley-music-learning-to-generate-music-from-videos-foley-music-learning-to-generate-music-from-videos.txt",
          "section_or_location": "Table 3. Automatic metrics for different models. For NDB, lower is better.",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_learning-frame-level-attention-for-environmental-sound-classification",
      "slug": "learning-frame-level-attention-for-environmental-sound-classification",
      "title": "Learning Frame Level Attention for Environmental Sound Classification",
      "year": 2020,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Zhichao Zhang",
        "Shugong Xu",
        "Shunqing Zhang",
        "Tianhao Qiao",
        "Shan Cao"
      ],
      "url": "https://nao-ki-mura.com/paper/learning-frame-level-attention-for-environmental-sound-classification",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2007.07241",
      "arxiv_url": "https://arxiv.org/abs/2007.07241",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "modality:acoustic",
        "output:labels",
        "task:audio-classification",
        "evaluation:quantitative",
        "evaluation:structured-benchmark"
      ],
      "expert_take_short": "Strong ESC paper, but outside SSI.",
      "expert_take_long": "The full text supports a tight reading: ACRNN improves environmental sound classification by selectively upweighting informative frames instead of inventing a new sensing modality. Table 4 shows the gains are real but modest, and Table 2 matters because the model keeps almost the same compute as the non-attention CRNN while greatly undercutting PiczakCNN. That makes it a respectable adjacent benchmark, not a silent-speech contribution.",
      "expert_true_value": "This is a compact audio-classification paper, not an SSI system; its real contribution is efficient attention over salient sound frames.",
      "canon_before": "Environmental sound classifiers already used CNN or CRNN backbones, but they spent capacity on silent or irrelevant frames.",
      "delta_from_canon": "ACRNN adds frame-level attention and shows the best gains when attention is applied at the recurrent output layer.",
      "position_in_field": "Adjacent audio benchmark outside SSI proper.",
      "practical_value": "Useful only as a reference for selective temporal weighting in audio models, not as an SSI baseline.",
      "axes_moved": "attention; compute_efficiency; temporal_salience",
      "axes_unresolved": "noise robustness and SSI relevance remain unresolved.",
      "axes_regressed": "",
      "technical_limits": "The method is limited to clip-level environmental sound classification and does not model noise robustness beyond the reported datasets.",
      "evaluation_limits": "Results are only on ESC-10 and ESC-50, and the conclusion explicitly says noise robustness was not quantified.",
      "deployment_limits": "No SSI, wearable, or real-time deployment story is present.",
      "scope_limits": "Environmental sound classification only.",
      "task": "audio-classification",
      "input_modality": "environmental audio spectrograms",
      "sensor_hardware": "microphone",
      "body_site": "",
      "output_type": "labels",
      "vocabulary_type": "",
      "vocabulary_size": "",
      "metrics": "Best configuration reaches 93.7% on ESC-10 and 86.1% on ESC-50, while Table 2 keeps the model at 3.81M parameters and 9.18M FLOPs versus PiczakCNN at 31.53M and 63.27M.",
      "evaluation_mode": "5-fold ESC-10 and ESC-50 classification benchmark with ablations on attention placement and scaling",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 0.97,
          "statement": "The abstract states that the model uses frame-level attention to focus on semantically relevant and salient frames in ESC and is evaluated on ESC-50 and ESC-10.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_learning-frame-level-attention-for-environmental-sound-classification-learning-frame-level-attention-for-environmental-sound-classification.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_learning-frame-level-attention-for-environmental-sound-classification-learning-frame-level-attention-for-environmental-sound-classification.txt",
          "section_or_location": "ABSTRACT",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 0.95,
          "statement": "Table 4 and the surrounding discussion show that the key design choice is where attention is applied, with the recurrent output layer l10 giving the best results.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_learning-frame-level-attention-for-environmental-sound-classification-learning-frame-level-attention-for-environmental-sound-classification.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_learning-frame-level-attention-for-environmental-sound-classification-learning-frame-level-attention-for-environmental-sound-classification.txt",
          "section_or_location": "Table 4",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.97,
          "statement": "Table 4 reports the best configuration at 93.7% on ESC-10 and 86.1% on ESC-50, and Figure 4 labels the ESC-50 average accuracy as 86.1%.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_learning-frame-level-attention-for-environmental-sound-classification-learning-frame-level-attention-for-environmental-sound-classification.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_learning-frame-level-attention-for-environmental-sound-classification-learning-frame-level-attention-for-environmental-sound-classification.txt",
          "section_or_location": "Table 4",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 0.93,
          "statement": "The conclusion explicitly says robustness to noise was not quantified and is left for future work.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_learning-frame-level-attention-for-environmental-sound-classification-learning-frame-level-attention-for-environmental-sound-classification.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_learning-frame-level-attention-for-environmental-sound-classification-learning-frame-level-attention-for-environmental-sound-classification.txt",
          "section_or_location": "5. Conclusion",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_ultra2speech-a-deep-learning-framework-for-formant-frequency-estimation-and-tracking-from-ultrasound-tongue-images",
      "slug": "ultra2speech-a-deep-learning-framework-for-formant-frequency-estimation-and-tracking-from-ultrasound-tongue-images",
      "title": "Ultra2Speech -- A Deep Learning Framework for Formant Frequency Estimation and Tracking from Ultrasound Tongue Images",
      "year": 2020,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Pramit Saha",
        "Yadong Liu",
        "Bryan Gick",
        "Sidney Fels"
      ],
      "url": "https://nao-ki-mura.com/paper/ultra2speech-a-deep-learning-framework-for-formant-frequency-estimation-and-tracking-from-ultrasound-tongue-images",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2006.16367",
      "arxiv_url": "https://arxiv.org/abs/2006.16367",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:oral-cavity",
        "body_site:tongue",
        "modality:ultrasound",
        "output:speech-audio",
        "task:speech-reconstruction",
        "evaluation:quantitative"
      ],
      "expert_take_short": "Strong ultrasound SSI paper with unusually clear quantitative gains.",
      "expert_take_long": "The full text shows this is more than a formant-regression curiosity. U2F cleanly beats the Conv-BiLSTM and plain 3D CNN baselines, and the joint f1-f2 result at 99.96 mean R2 is far above the recurrent baseline. The interesting systems insight is that hybrid spatial-temporal blocks plus channel shuffling are not cosmetic: the ablations show each piece contributes, and the conclusion frames the model as a path toward less manual tongue-contour extraction in SSI pipelines.",
      "expert_true_value": "A strong SSI paper because it turns ultrasound tongue video into a high-quality articulatory-to-acoustic mapping problem with convincing quantitative gains.",
      "canon_before": "Ultrasound SSI work often depended on handcrafted tongue features or weaker sequence models.",
      "delta_from_canon": "U2F uses hybrid 2D spatial and 1D temporal convolutions with shuffling to learn end-to-end formant tracking from raw ultrasound clips.",
      "position_in_field": "Core ultrasound-based SSI work for speech restoration.",
      "practical_value": "Useful as a full-text-backed reference for ultrasound-to-acoustic mapping and articulatory feature learning.",
      "axes_moved": "ultrasound_ssi; articulatory_to_acoustic_mapping; tongue_tracking",
      "axes_unresolved": "broader speech coverage and real-time patient deployment remain unresolved.",
      "axes_regressed": "",
      "technical_limits": "The reported system focuses on formant tracking and synthesized vowel trajectories rather than full open-vocabulary speech reconstruction.",
      "evaluation_limits": "Evaluation is on the collected ultrasound dataset and does not establish cross-speaker clinical deployment.",
      "deployment_limits": "The work motivates SSI use but does not present a real-time deployed device.",
      "scope_limits": "Ultrasound tongue-image to formant / synthesized speech pipeline.",
      "task": "speech-reconstruction",
      "input_modality": "ultrasound tongue image sequences",
      "sensor_hardware": "ultrasound probe",
      "body_site": "tongue; oral-cavity",
      "output_type": "speech-audio",
      "vocabulary_type": "",
      "vocabulary_size": "",
      "metrics": "The best U2F configuration reaches mean R2 99.96 on joint f1-f2 prediction, versus 90.01 for the Conv-BiLSTM baseline on the same joint task.",
      "evaluation_mode": "train-dev-test split on ultrasound videos with MAE and mean R2 plus baseline and ablation comparisons",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 0.97,
          "statement": "The abstract says U2F maps ultrasound tongue images to formant trajectories and then synthesizes continuous vowel trajectories with a Klatt synthesizer.",
          "evidence_source": "/Users/naoki/projects/nao-ki-mura/inputs/ssi_fulltext/text/ssi_ultra2speech-a-deep-learning-framework-for-formant-frequency-estimation-and-tracking-from-ultrasound-tongue-images-ultra2speech-a-deep-learning-framework-for-formant-frequency-estimation-and-trac.txt",
          "source_ref": "/Users/naoki/projects/nao-ki-mura/inputs/ssi_fulltext/text/ssi_ultra2speech-a-deep-learning-framework-for-formant-frequency-estimation-and-tracking-from-ultrasound-tongue-images-ultra2speech-a-deep-learning-framework-for-formant-frequency-estimation-and-trac.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.97,
          "statement": "Table 1 reports the best U2F configuration at mean R2 99.96 on joint f1-f2 prediction, outperforming the recurrent and standard 3D CNN baselines.",
          "evidence_source": "/Users/naoki/projects/nao-ki-mura/inputs/ssi_fulltext/text/ssi_ultra2speech-a-deep-learning-framework-for-formant-frequency-estimation-and-tracking-from-ultrasound-tongue-images-ultra2speech-a-deep-learning-framework-for-formant-frequency-estimation-and-trac.txt",
          "source_ref": "/Users/naoki/projects/nao-ki-mura/inputs/ssi_fulltext/text/ssi_ultra2speech-a-deep-learning-framework-for-formant-frequency-estimation-and-tracking-from-ultrasound-tongue-images-ultra2speech-a-deep-learning-framework-for-formant-frequency-estimation-and-trac.txt",
          "section_or_location": "Table 1. Performance comparison with baseline methods",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 0.95,
          "statement": "The discussion lists the hybrid spatial-temporal feature extraction and channel shuffling as the central architectural contribution.",
          "evidence_source": "/Users/naoki/projects/nao-ki-mura/inputs/ssi_fulltext/text/ssi_ultra2speech-a-deep-learning-framework-for-formant-frequency-estimation-and-tracking-from-ultrasound-tongue-images-ultra2speech-a-deep-learning-framework-for-formant-frequency-estimation-and-trac.txt",
          "source_ref": "/Users/naoki/projects/nao-ki-mura/inputs/ssi_fulltext/text/ssi_ultra2speech-a-deep-learning-framework-for-formant-frequency-estimation-and-tracking-from-ultrasound-tongue-images-ultra2speech-a-deep-learning-framework-for-formant-frequency-estimation-and-trac.txt",
          "section_or_location": "5    Discussion and Conclusion",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 0.92,
          "statement": "The paper evaluates end-to-end mapping to formant frequencies and vowel trajectories, not a full open-vocabulary speech restoration benchmark.",
          "evidence_source": "/Users/naoki/projects/nao-ki-mura/inputs/ssi_fulltext/text/ssi_ultra2speech-a-deep-learning-framework-for-formant-frequency-estimation-and-tracking-from-ultrasound-tongue-images-ultra2speech-a-deep-learning-framework-for-formant-frequency-estimation-and-trac.txt",
          "source_ref": "/Users/naoki/projects/nao-ki-mura/inputs/ssi_fulltext/text/ssi_ultra2speech-a-deep-learning-framework-for-formant-frequency-estimation-and-tracking-from-ultrasound-tongue-images-ultra2speech-a-deep-learning-framework-for-formant-frequency-estimation-and-trac.txt",
          "section_or_location": "5    Discussion and Conclusion",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_application-of-just-noticeable-difference-in-quality-as-environment-suitability-test-for-crowdsourcing-speech-quality-assessment-task",
      "slug": "application-of-just-noticeable-difference-in-quality-as-environment-suitability-test-for-crowdsourcing-speech-quality-assessment-task",
      "title": "Application of Just-Noticeable Difference in Quality as Environment Suitability Test for Crowdsourcing Speech Quality Assessment Task",
      "year": 2020,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "[''",
        "'']"
      ],
      "url": "https://nao-ki-mura.com/paper/application-of-just-noticeable-difference-in-quality-as-environment-suitability-test-for-crowdsourcing-speech-quality-assessment-task",
      "doi": "10.1109/QoMEX48832.2020.9123093",
      "doi_url": "https://doi.org/10.1109/QoMEX48832.2020.9123093",
      "arxiv_id": "2004.05502",
      "arxiv_url": "https://arxiv.org/abs/2004.05502",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "output:labels",
        "evaluation:quantitative"
      ],
      "expert_take_short": "Strong crowdsourcing methodology paper, not SSI.",
      "expert_take_long": "The full text supports a practical claim: a short JNDQ-based gate can distinguish better and worse remote listening environments before crowd MOS collection. The strongest result is methodological rather than algorithmic, with the paper quantifying how stricter versus more lenient screening changes correlation to laboratory MOS and rejection rates. That is valuable for speech-quality experiments, but it has no direct SSI sensing or reconstruction contribution.",
      "expert_true_value": "Useful screening method for crowdsourced speech-quality studies, but not an SSI paper.",
      "canon_before": "Crowdsourced speech-quality studies had limited control over participant playback environment and no lightweight suitability screen.",
      "delta_from_canon": "Introduces a modified JNDQ gate that screens playback device and background-noise suitability before MOS collection.",
      "position_in_field": "Crowdsourcing methodology paper outside SSI core scope.",
      "practical_value": "Can improve the reliability-cost tradeoff of remote speech-quality studies by rejecting unsuitable listening environments.",
      "axes_moved": "evaluation; system_design",
      "axes_unresolved": "Optimal retest frequency; broader generalization across platforms and tasks",
      "axes_regressed": "",
      "technical_limits": "Not continuous monitoring; the environment can change after the screening step, and inserting the test too often increases session time.",
      "evaluation_limits": "Findings are tied to the tested JND levels, degradation conditions, and the specific crowdsourcing setup.",
      "deployment_limits": "Useful only as an evaluation-control mechanism, not an SSI deployment component.",
      "scope_limits": "Crowdsourced speech-quality environment screening only.",
      "task": "crowdsourcing environment screening",
      "input_modality": "speech audio",
      "sensor_hardware": "listener playback device + headphone/speaker setup",
      "body_site": "",
      "output_type": "labels",
      "vocabulary_type": "",
      "vocabulary_size": "",
      "metrics": "Highest correlation to laboratory MOS came from JND 6 dB with at least 3 of 4 answers correct; the lenient JND 10 dB with at least 1 of 4 answers correct failed only 15% of answers versus 61% for the strict setup",
      "evaluation_mode": "laboratory and crowdsourcing subjective evaluation with PCC, SRCC, and RMSE against laboratory MOS",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 0.98,
          "statement": "The paper argues that a properly designed JNDQ test can distinguish noisy environments from silent conditions for crowdsourced speech-quality assessment once the listening device is known.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_application-of-just-noticeable-difference-in-quality-as-environment-suitability-test-for-crowdsourcing-speech-quality-assessment-task-application-of-just-noticeable-difference-in-quality-as-environment-suitability-.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_application-of-just-noticeable-difference-in-quality-as-environment-suitability-test-for-crowdsourcing-speech-quality-assessment-task-application-of-just-noticeable-difference-in-quality-as-environment-suitability-.txt",
          "section_or_location": "IV. D ISCUSSION AND CONCLUSION",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.97,
          "statement": "The crowdsourcing evaluation reports that the strict JND 6 dB and 3-of-4-correct setup achieved the highest correlation to laboratory MOS, while the lenient JND 10 dB and 1-of-4-correct setup was more cost efficient with only about 15% failed answers versus 61% in the strict case.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_application-of-just-noticeable-difference-in-quality-as-environment-suitability-test-for-crowdsourcing-speech-quality-assessment-task-application-of-just-noticeable-difference-in-quality-as-environment-suitability-.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_application-of-just-noticeable-difference-in-quality-as-environment-suitability-test-for-crowdsourcing-speech-quality-assessment-task-application-of-just-noticeable-difference-in-quality-as-environment-suitability-.txt",
          "section_or_location": "B. Crowdsourcing evaluation",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 0.95,
          "statement": "The discussion states that the modified JNDQ test only evaluates suitability at a specific time, so frequent insertion increases session duration while infrequent testing misses environment changes.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_application-of-just-noticeable-difference-in-quality-as-environment-suitability-test-for-crowdsourcing-speech-quality-assessment-task-application-of-just-noticeable-difference-in-quality-as-environment-suitability-.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_application-of-just-noticeable-difference-in-quality-as-environment-suitability-test-for-crowdsourcing-speech-quality-assessment-task-application-of-just-noticeable-difference-in-quality-as-environment-suitability-.txt",
          "section_or_location": "IV. D ISCUSSION AND CONCLUSION",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_vocoder-based-speech-synthesis-from-silent-videos",
      "slug": "vocoder-based-speech-synthesis-from-silent-videos",
      "title": "Vocoder-Based Speech Synthesis from Silent Videos",
      "year": 2020,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Daniel Michelsanti",
        "Olga Slizovskaia",
        "Gloria Haro",
        "Emilia Gomez",
        "Zheng-Hua Tan",
        "Jesper Jensen"
      ],
      "url": "https://nao-ki-mura.com/paper/vocoder-based-speech-synthesis-from-silent-videos",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2004.02541",
      "arxiv_url": "https://arxiv.org/abs/2004.02541",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:lip",
        "deployment:hands-free",
        "deployment:real-time",
        "deployment:speaker-independent",
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "modality:video",
        "output:speech-audio",
        "task:speech-reconstruction"
      ],
      "expert_take_short": "A notable step forward in lip-to-speech synthesis by predicting full vocoder features and jointly training for recognition, achieving strong speaker-dependent results but lacking unseen speaker generalization.",
      "expert_take_long": "This paper proposes a strong and interpretable baseline for video-to-speech synthesis by predicting a full set of WORLD vocoder parameters from silent videos using deep learning. The system employs a video encoder and GRU recursive module to regress spectral envelope, fundamental frequency, aperiodic parameters, and voiced/unvoiced decisions. Joint multi-task learning with an auxiliary visual speech recognition decoder provides measurable improvements. Evaluation on the GRID corpus shows that mouth-only input performs better than full-face input for reconstruction, and speaker-dependent results significantly surpass previous GAN-based methods with PESQ up to 1.90 and ESTOI 0.455, accompanied by low WER. However, generalization to unseen speakers remains a significant challenge, with PESQ dropping to 1.23 and WER rising above 50%. The closed vocabulary and controlled conditions of GRID limit deployment readiness. Overall, the paper advances video-to-speech by combining full vocoder parameter prediction with auxiliary recognition in a multi-task framework, outperforming prior approaches in speaker-dependent settings while highlighting the need for improved generalization.",
      "expert_true_value": "Demonstrates that full vocoder parameter prediction from video plus auxiliary speech recognition creates a practical baseline with better quality and intelligibility than prior partial feature or direct waveform methods.",
      "canon_before": "Prior video-to-speech systems predicted partial acoustic features or directly generated waveforms causing artifacts, with vocoder fundamental frequency and aperiodic parameters often synthesized or omitted.",
      "delta_from_canon": "Estimates all vocoder features (SP, F0, AP) directly from raw video frames with optional VSR auxiliary task improving reconstruction; uses deep video encoder and GRU-based recursive module with dedicated decoders for each vocoder parameter.",
      "position_in_field": "Provides a well-designed full vocoder parameter lip-to-speech baseline illustrating limits of speaker-independent generalization on GRID with multi-task VSR training.",
      "practical_value": "Useful as an interpretable baseline for controlled speaker-dependent video-to-speech applications, highlighting benefits of predicting full vocoder parameters.",
      "axes_moved": "system_design; evaluation",
      "axes_unresolved": "Speaker-independent generalization remains weak with large performance drops compared to speaker-dependent conditions.",
      "axes_regressed": "",
      "technical_limits": "Restricted to closed vocabulary GRID, speaker-dependent models perform well but speaker-independent results degrade sharply, limiting real-world application.",
      "evaluation_limits": "Evaluation performed only on GRID corpus with fixed sentence grammar under controlled speaker-dependent and independent splits; no tested generalization to open vocabulary, noisy, or varied environments.",
      "deployment_limits": "Limited to closed-vocabulary GRID sentences, requires frontal silent video input; speaker-independent performance is significantly worse, limiting generalization to unseen speakers and real-world environments.",
      "scope_limits": "Silent frontal video on GRID corpus with speaker-dependent and independent protocols; closed English vocabulary fixed sentence grammar; not tested for noisy or in-the-wild conditions.",
      "task": "speech-reconstruction from silent video",
      "input_modality": "video (silent frontal face or mouth region)",
      "sensor_hardware": "camera",
      "body_site": "lip",
      "output_type": "speech-audio",
      "vocabulary_type": "closed-vocabulary English sentences",
      "vocabulary_size": "GRID dataset vocabulary with fixed sentence structure of 6 words",
      "metrics": "Speaker-dependent vid2voc with VSR achieves PESQ 1.90, ESTOI 0.455, WER 15.1%; speaker-independent version drops to PESQ 1.23, ESTOI 0.227, WER 51.6%.",
      "evaluation_mode": "Objective speech quality (PESQ), intelligibility (ESTOI), and word error rate (WER) from auxiliary VSR system measured under speaker-dependent and speaker-independent settings.",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 1,
          "statement": "We present a way to synthesise speech from the silent video of a talker using deep learning that learns a mapping from raw video frames to acoustic features and reconstructs speech with a vocoder synthesis algorithm, trained also to predict text in a multi-task fashion for simultaneous speech reconstruction and recognition in real time.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_vocoder-based-speech-synthesis-from-silent-videos-vocoder-based-speech-synthesis-from-silent-videos.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_vocoder-based-speech-synthesis-from-silent-videos-vocoder-based-speech-synthesis-from-silent-videos.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 1,
          "statement": "Our method differs from prior work by predicting all vocoder features (spectral envelope, fundamental frequency, and aperiodic parameters) directly from raw video frames rather than only spectral envelope or estimating waveforms directly, enabling focus on speech intelligibility and quality and outperforming prior GAN-based approaches.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_vocoder-based-speech-synthesis-from-silent-videos-vocoder-based-speech-synthesis-from-silent-videos.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_vocoder-based-speech-synthesis-from-silent-videos-vocoder-based-speech-synthesis-from-silent-videos.txt",
          "section_or_location": "1. Introduction",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 1,
          "statement": "Experiments are conducted on the GRID corpus consisting of audio and video from 34 speakers uttering 1000 six-word closed vocabulary sentences, with evaluations in speaker-dependent and speaker-independent settings using distinct training, validation, and test partitions per setting.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_vocoder-based-speech-synthesis-from-silent-videos-vocoder-based-speech-synthesis-from-silent-videos.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_vocoder-based-speech-synthesis-from-silent-videos-vocoder-based-speech-synthesis-from-silent-videos.txt",
          "section_or_location": "2.1. Audio",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "The system is evaluated using PESQ for speech quality, ESTOI for intelligibility, and WER from the auxiliary VSR decoder's CTC-based transcription in speaker-dependent and speaker-independent conditions, with higher PESQ/ESTOI and lower WER indicating better results.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_vocoder-based-speech-synthesis-from-silent-videos-vocoder-based-speech-synthesis-from-silent-videos.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_vocoder-based-speech-synthesis-from-silent-videos-vocoder-based-speech-synthesis-from-silent-videos.txt",
          "section_or_location": "2.5. Evaluation Metrics",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "In speaker-dependent experiments, the mouth-input system with VSR achieves PESQ 1.90, ESTOI 0.455, and WER 15.1%, outperforming prior GAN-based and bottleneck feature methods; speaker-independent results degrade to PESQ 1.23, ESTOI 0.227, and WER 51.6%.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_vocoder-based-speech-synthesis-from-silent-videos-vocoder-based-speech-synthesis-from-silent-videos.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_vocoder-based-speech-synthesis-from-silent-videos-vocoder-based-speech-synthesis-from-silent-videos.txt",
          "section_or_location": "3. Results",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 1,
          "statement": "Performance drops substantially in the speaker-independent scenario, showing weaker generalization to unseen speakers with a large spread in scores due to differing facial characteristics and speech traits.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_vocoder-based-speech-synthesis-from-silent-videos-vocoder-based-speech-synthesis-from-silent-videos.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_vocoder-based-speech-synthesis-from-silent-videos-vocoder-based-speech-synthesis-from-silent-videos.txt",
          "section_or_location": "3.2. Speaker Independent Case",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "deployment_claim",
          "confidence": 1,
          "statement": "The approach requires silent frontal face or mouth region video input, relies on the controlled closed-vocabulary GRID dataset, and shows limited ability to generalize to unseen speakers, constraining its applicability in real-world settings.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_vocoder-based-speech-synthesis-from-silent-videos-vocoder-based-speech-synthesis-from-silent-videos.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_vocoder-based-speech-synthesis-from-silent-videos-vocoder-based-speech-synthesis-from-silent-videos.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 1,
          "statement": "The system uses a deep network mapping silent video frames concatenated in temporal context through 3-D convolutions, followed by a GRU-based recursive module and separate decoders for spectral envelope, aperiodic parameters, voiced/unvoiced state, fundamental frequency, and auxiliary VSR output decoded with CTC loss to jointly learn speech reconstruction and recognition.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_vocoder-based-speech-synthesis-from-silent-videos-vocoder-based-speech-synthesis-from-silent-videos.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_vocoder-based-speech-synthesis-from-silent-videos-vocoder-based-speech-synthesis-from-silent-videos.txt",
          "section_or_location": "2.3. Architecture",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "author_claim",
          "confidence": 1,
          "statement": "The system reconstructs speech waveforms from predicted WORLD vocoder parameters using the STRAIGHT vocoder synthesis algorithm and obtains text transcriptions from the auxiliary VSR decoder by best path CTC decoding.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_vocoder-based-speech-synthesis-from-silent-videos-vocoder-based-speech-synthesis-from-silent-videos.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_vocoder-based-speech-synthesis-from-silent-videos-vocoder-based-speech-synthesis-from-silent-videos.txt",
          "section_or_location": "2.4. Waveform Reconstruction",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "fact",
          "confidence": 1,
          "statement": "Previous methods predicted incomplete spectral features or direct waveforms causing artifacts; this work instead estimates spectral envelope, F0, and aperiodic parameters, enabling better speech intelligibility and quality.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_vocoder-based-speech-synthesis-from-silent-videos-vocoder-based-speech-synthesis-from-silent-videos.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_vocoder-based-speech-synthesis-from-silent-videos-vocoder-based-speech-synthesis-from-silent-videos.txt",
          "section_or_location": "1. Introduction",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_continuous-silent-speech-recognition-using-eeg",
      "slug": "continuous-silent-speech-recognition-using-eeg",
      "title": "Continuous Silent Speech Recognition using EEG",
      "year": 2020,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Gautam Krishna",
        "Co Tran",
        "Mason Carnahan",
        "Ahmed H Tewfik"
      ],
      "url": "https://nao-ki-mura.com/paper/continuous-silent-speech-recognition-using-eeg",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "2002.03851",
      "arxiv_url": "https://arxiv.org/abs/2002.03851",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:brain",
        "deployment:hands-free",
        "deployment:speaker-dependent",
        "evaluation:quantitative",
        "modality:eeg",
        "output:text",
        "task:speech-recognition"
      ],
      "expert_take_short": "Real EEG sentence-level silent speech recognition is demonstrated but at very high WER, confirming feasibility only and underscoring the immature state of current EEG silent speech technology.",
      "expert_take_long": "This paper represents an important proof-of-concept for continuous silent speech recognition from real EEG signals. Using a comprehensive recording from 31 scalp sensors, handcrafted EEG features, KPCA compression, and a GRU+TCN CTC-based ASR model, the authors demonstrate decoding of silently read English sentences. Despite the novelty of continuous sentence-level decoding, performance remains far from practical, with within-subject WER averaging 83.34% and cross-subject WER degrading further to 92.55%. The small dataset of 30 sentences from four subjects limits robustness and generalizability. Real-time capability is unreported. Nonetheless, the work expands the experimental scope of EEG silent speech research, setting a challenging baseline and highlighting substantial challenges ahead for real deployment.",
      "expert_true_value": "The work extends EEG silent speech research from small-vocabulary or passive listening setups to continuous sentence recognition with real EEG, providing a candid benchmark of high error rates and subject variability that clarifies feasibility and informs future model and dataset design.",
      "canon_before": "Prior EEG silent speech work mostly focused on isolated commands, small vocabularies, or passive listening rather than imagined sentence reading with continuous decoding.",
      "delta_from_canon": "Expansion from discrete word or command recognition to continuous sentence-level EEG silent speech decoding with CTC-based deep model and KPCA feature compression.",
      "position_in_field": "A significant EEG silent speech reference for continuous sentence decoding that underscores current technological and data limitations.",
      "practical_value": "Provides an important baseline for continuous silent speech decoding from EEG and encourages research toward improved modeling and datasets; currently not practical for real-world use.",
      "axes_moved": "Evaluation: single-word or small-vocabulary EEG silent speech recognition to continuous sentence decoding; Modality: EEG signal processing with feature engineering and deep CTC ASR.",
      "axes_unresolved": "Cross-subject transfer robustness and real-time system feasibility remain unresolved.",
      "axes_regressed": "",
      "technical_limits": "Very small dataset; high WER; limited subject pool; lack of cross-subject robustness; no real-time validation; requires full 31-channel EEG cap; no wearable or practical signal capture setup.",
      "evaluation_limits": "Only 30 unique English sentences from four subjects; random 80/20 train/test split; WER metrics without unseen-word or cross-environment testing; no walking tested; no cross-device evaluation.",
      "deployment_limits": "Very high word error rate (WER), poor subject generalization (cross-subject WER 92.55%), requirement of full 31-channel EEG cap, lack of real-time results, and small dataset size.",
      "scope_limits": "Demonstration limited to 30 unique silently read sentences across four subjects, no environmental or cross-device variability.",
      "task": "Continuous silent speech recognition from EEG.",
      "input_modality": "31-channel scalp EEG recorded while subjects silently read sentences mentally.",
      "sensor_hardware": "32-electrode (31 EEG + ground) wet EEG cap with sensors placed according to standard 10-20 montage.",
      "body_site": "brain",
      "output_type": "Text (decoded English sentences)",
      "vocabulary_type": "English read sentences silently (imagined speech).",
      "vocabulary_size": "30 unique sentences, 72 total sentences used in testing.",
      "metrics": "Word error rate (WER) averages from ~74.86% to 84.22% across 12 to 72 sentence test sets; cross-subject WER at 92.55%.",
      "evaluation_mode": "Test set WER computed across varying vocabulary sizes and cross-subject conditions with character-level CTC decoding and external 4-gram language model.",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 1,
          "statement": "The paper demonstrates the feasibility of using EEG signals for continuous silent speech recognition while subjects silently read English sentences without vocalizing.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_continuous-silent-speech-recognition-using-eeg-continuous-silent-speech-recognition-using-eeg.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_continuous-silent-speech-recognition-using-eeg-continuous-silent-speech-recognition-using-eeg.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 1,
          "statement": "The paper introduces a continuous sentence-level silent speech recognition approach from real EEG data using a character-level CTC automatic speech recognition model, distinct from prior small-vocabulary or passive-listening EEG silent speech studies.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_continuous-silent-speech-recognition-using-eeg-continuous-silent-speech-recognition-using-eeg.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_continuous-silent-speech-recognition-using-eeg-continuous-silent-speech-recognition-using-eeg.txt",
          "section_or_location": "1. Introduction",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "fact",
          "confidence": 1,
          "statement": "EEG was recorded from four male subjects using a 32-electrode (31 channels plus ground) wet EEG cap placed according to standard 10-20 system at 1000 Hz sampling frequency while subjects silently read 30 USC-TIMIT English sentences.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_continuous-silent-speech-recognition-using-eeg-continuous-silent-speech-recognition-using-eeg.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_continuous-silent-speech-recognition-using-eeg-continuous-silent-speech-recognition-using-eeg.txt",
          "section_or_location": "3. Design of Experiments for building the ing rate,moving window average,kurtosis and power spectral",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "fact",
          "confidence": 1,
          "statement": "The EEG features extracted per channel are root mean square, zero crossing rate, moving window average, kurtosis, and power spectral entropy, totalling 155 features over 31 channels, which are reduced to 20 dimensions using Kernel Principal Component Analysis (KPCA) with polynomial kernel of degree 3.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_continuous-silent-speech-recognition-using-eeg-continuous-silent-speech-recognition-using-eeg.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_continuous-silent-speech-recognition-using-eeg-continuous-silent-speech-recognition-using-eeg.txt",
          "section_or_location": "4. EEG feature extraction details",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "fact",
          "confidence": 1,
          "statement": "The automatic speech recognition (ASR) model is based on Connectionist Temporal Classification (CTC) with an encoder of two GRU layers (128 and 64 units) followed by a Temporal Convolutional Network (TCN) with 32 filters and a decoder with softmax output for character-level transcription.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_continuous-silent-speech-recognition-using-eeg-continuous-silent-speech-recognition-using-eeg.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_continuous-silent-speech-recognition-using-eeg-continuous-silent-speech-recognition-using-eeg.txt",
          "section_or_location": "Connectionist Temporal Classification (CTC)",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "Test-time word error rates (WER) ranged approx. from 74.86% to 84.22% for vocabulary sizes from 12 to 72 sentences (30 unique), with a WER of 83.34% on the 72 sentence test set; cross-subject WER rises to 92.55%.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_continuous-silent-speech-recognition-using-eeg-continuous-silent-speech-recognition-using-eeg.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_continuous-silent-speech-recognition-using-eeg-continuous-silent-speech-recognition-using-eeg.txt",
          "section_or_location": "6. Results",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 1,
          "statement": "Limitations include the small dataset (four subjects, 30 unique sentences), poor accuracy indicated by very high WER, and weak generalization across subjects evidenced by increased cross-subject WER.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_continuous-silent-speech-recognition-using-eeg-continuous-silent-speech-recognition-using-eeg.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_continuous-silent-speech-recognition-using-eeg-continuous-silent-speech-recognition-using-eeg.txt",
          "section_or_location": "6. Results",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "deployment_claim",
          "confidence": 1,
          "statement": "The approach is not presently suitable for deployment given high WER, requirement of full 31-channel EEG cap, and lack of real-time decoding results.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_continuous-silent-speech-recognition-using-eeg-continuous-silent-speech-recognition-using-eeg.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_continuous-silent-speech-recognition-using-eeg-continuous-silent-speech-recognition-using-eeg.txt",
          "section_or_location": "7. Conclusion",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_brain2char-a-deep-architecture-for-decoding-text-from-brain-recordings",
      "slug": "brain2char-a-deep-architecture-for-decoding-text-from-brain-recordings",
      "title": "Brain2Char: A Deep Architecture for Decoding Text from Brain Recordings",
      "year": 2019,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Pengfei Sun",
        "Gopala K. Anumanchipalli",
        "Edward F. Chang"
      ],
      "url": "https://nao-ki-mura.com/paper/brain2char-a-deep-architecture-for-decoding-text-from-brain-recordings",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "1909.01401",
      "arxiv_url": "https://arxiv.org/abs/1909.01401",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium-high",
      "review_confidence": "medium-high",
      "review_basis": "full_text+structured_benchmark+summary",
      "source_coverage": "high",
      "tags": [
        "body_site:brain",
        "deployment:hands-free",
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "output:text",
        "task:speech-recognition"
      ],
      "expert_take_short": "Brain2Char establishes a new state-of-the-art for continuous character decoding from invasive ECoG with competitive WER on large vocabularies and silent speech, demonstrating feasibility for communication BCIs.",
      "expert_take_long": "Brain2Char represents a significant step forward from earlier work limited to phoneme classification or small vocabularies by demonstrating continuous decoding of character sequences from ECoG recordings during overt and silent speech. The proposed architecture integrates multi-scale convolutional encoders and sequential recurrent and dilated CNN decoders with auxiliary physiological regularization and session calibration, yielding 7-11% WER on vocabularies up to 1900 words in 3 participants. Silent speech is decoded at higher WER (~40-67%) in two subjects. Remaining challenges include invasiveness, the need for per-subject calibration, limited vocabulary, and generalization beyond the training participants. Nevertheless, this system sets a new benchmark for neural speech recognition from direct brain signals and is promising for communication BCIs especially in clinical populations unable to speak.",
      "expert_true_value": "Provides the first demonstration of sentence-level brain-to-character decoding with modern neural sequence models and independent evaluation on multiple subjects, going beyond toy imagined-word classification models toward realistic speech recognition from brain signals.",
      "canon_before": "Prior brain-to-text decoding from ECoG data had not achieved continuous sentence-level character decoding with competitive WER on vocabulary sizes above approximately 50 words, often limited to phoneme or word classification.",
      "delta_from_canon": "Introduces a modular architecture combining 3D multi-scale inception convolutional encoder, bidirectional LSTM layers, dilated CNN decoder layers with CTC loss and language model beam search, plus latent feature regularization using speech acoustics, articulatory kinematics, and session embeddings.",
      "position_in_field": "Strong benchmark in sentence-level ECoG-based brain text decoding with larger vocabularies and physiological regularization.",
      "practical_value": "Potentially useful for clinical communication BCIs once invasiveness and calibration challenges are addressed.",
      "axes_moved": "evaluation; system_design",
      "axes_unresolved": "cross-subject transfer; larger vocabulary decoding",
      "axes_regressed": "",
      "technical_limits": "Limited dataset size; invasive sensor modality; session to session neural variability; vocabulary limited to 1200-1900 words; no cross-subject transfer; silent speech decoding at higher error rates.",
      "evaluation_limits": "Evaluation restricted to 4 participants with invasive ECoG, limited sentence sets, and no cross-subject testing or long-term deployment; silent speech tested only on two participants with about 20 sentences each; no walking or mobile scenarios assessed.",
      "deployment_limits": "Invasive ECoG recording modality requiring clinical implantation; calibration needed per subject and per session; vocabulary sizes remain limited for realistic free text use; no real-world deployment tested.",
      "scope_limits": "Sentence-level decoding from invasive ECoG collected during speech tasks in 4 participants; no non-invasive or cross-subject generalization assessed.",
      "task": "speech-recognition",
      "input_modality": "electrocorticography (ECoG) invasive brain recordings",
      "sensor_hardware": "16x16 and 16x8 electrode ECoG grids implanted on ventral sensorimotor cortex, inferior frontal gyrus, superior temporal gyrus",
      "body_site": "brain",
      "output_type": "text",
      "vocabulary_type": "sentence-level English",
      "vocabulary_size": "1200 - 1900 words",
      "metrics": "Word Error Rate (WER): 10.6%, 8.5%, and 7.0% for three participants on vocabularies from 1200 to 1900 words; Silent speech decoding WER: ~40% and 67% for two subjects on 20 sentences.",
      "evaluation_mode": "Experimental study with neural signals and synchronous audio-text; train/test splits with increasing data sizes; partial neural data tests; silent speech trials.",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 0.95,
          "statement": "Brain2Char achieves low word error rates (10.6%, 8.5%, and 7.0%) decoding character sequences directly from ECoG brain recordings with vocabularies of 1200 to 1900 words.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_brain2char-a-deep-architecture-for-decoding-text-from-brain-recordings-brain2char-a-deep-architecture-for-decoding-text-from-brain-recordings.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_brain2char-a-deep-architecture-for-decoding-text-from-brain-recordings-brain2char-a-deep-architecture-for-decoding-text-from-brain-recordings.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 0.9,
          "statement": "Brain2Char combines 3D inception layers for multi-band spatiotemporal ECoG feature extraction, bi-directional LSTM layers, dilated CNN layers with CTC loss, and language model beam search to decode text from brain signals.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_brain2char-a-deep-architecture-for-decoding-text-from-brain-recordings-brain2char-a-deep-architecture-for-decoding-text-from-brain-recordings.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_brain2char-a-deep-architecture-for-decoding-text-from-brain-recordings-brain2char-a-deep-architecture-for-decoding-text-from-brain-recordings.txt",
          "section_or_location": "1.1 Brain2Char Architecture",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 0.85,
          "statement": "Evaluation performed on ECoG recordings from 4 participants reading prompted sentences with vocabularies between 400 and 1900 words, recorded over several sessions; also tested silent mimed speech in 2 participants with acceptable error rates.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_brain2char-a-deep-architecture-for-decoding-text-from-brain-recordings-brain2char-a-deep-architecture-for-decoding-text-from-brain-recordings.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_brain2char-a-deep-architecture-for-decoding-text-from-brain-recordings-brain2char-a-deep-architecture-for-decoding-text-from-brain-recordings.txt",
          "section_or_location": "2 Experimental Results",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.9,
          "statement": "Reported word error rates of Brain2Char reach as low as 7.0% for one participant and 10-11% for others on vocabulary tasks of 1200-1900 words; silent speech decoding WER is around 40-67%.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_brain2char-a-deep-architecture-for-decoding-text-from-brain-recordings-brain2char-a-deep-architecture-for-decoding-text-from-brain-recordings.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_brain2char-a-deep-architecture-for-decoding-text-from-brain-recordings-brain2char-a-deep-architecture-for-decoding-text-from-brain-recordings.txt",
          "section_or_location": "2 Experimental Results",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 0.85,
          "statement": "Brain2Char is limited by reliance on invasive ECoG sensor hardware, session-specific calibration, and limited dataset per participant; generalization to other subjects or larger vocabularies remains unproven.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_brain2char-a-deep-architecture-for-decoding-text-from-brain-recordings-brain2char-a-deep-architecture-for-decoding-text-from-brain-recordings.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_brain2char-a-deep-architecture-for-decoding-text-from-brain-recordings-brain2char-a-deep-architecture-for-decoding-text-from-brain-recordings.txt",
          "section_or_location": "1 Neural Speech Recognition from ECoG",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "deployment_claim",
          "confidence": 0.8,
          "statement": "Although Brain2Char demonstrates hands-free, incremental, and real-time capable decoding, deployment for practical communication BCIs is limited by invasiveness and need for extensive calibration.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_brain2char-a-deep-architecture-for-decoding-text-from-brain-recordings-brain2char-a-deep-architecture-for-decoding-text-from-brain-recordings.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_brain2char-a-deep-architecture-for-decoding-text-from-brain-recordings-brain2char-a-deep-architecture-for-decoding-text-from-brain-recordings.txt",
          "section_or_location": "3 Conclusion",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_demucs-deep-extractor-for-music-sources-with-extra-unlabeled-data-remixed",
      "slug": "demucs-deep-extractor-for-music-sources-with-extra-unlabeled-data-remixed",
      "title": "Demucs: Deep Extractor for Music Sources with extra unlabeled data remixed",
      "year": 2019,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Alexandr Defossez",
        "Nicola Usunier",
        "Léon Bottou",
        "Francis Bach"
      ],
      "url": "https://nao-ki-mura.com/paper/demucs-deep-extractor-for-music-sources-with-extra-unlabeled-data-remixed",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "1909.01174",
      "arxiv_url": "https://arxiv.org/abs/1909.01174",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "medium",
      "review_basis": "full_text+structured_benchmark+summary",
      "source_coverage": "high",
      "tags": [
        "modality:acoustic",
        "evaluation:structured-benchmark"
      ],
      "expert_take_short": "This work delivers an improved waveform source separation model combined with a novel remix-based semi-supervised learning scheme using unlabeled music. Though not related to silent speech, it advances music separation benchmarks by closing gaps to spectrogram methods.",
      "expert_take_long": "This paper presents Demucs, a novel waveform-based deep learning architecture for music source separation that bridges much of the performance gap to spectrogram-based methods. The key innovations include an encoder-decoder with GLU activations, bidirectional LSTM in the bottleneck, and a remixing-based weak supervision technique using unlabeled music data. Evaluations on the MusDB benchmark show that Demucs surpasses prior waveform methods like Wave-U-Net, and benefits from unlabeled data remix augmentation to approach state-of-the-art spectrogram models. However, the scope is strictly music source separation and does not extend to silent speech or real-time mobile deployment. The paper provides a practical waveform baseline and new semi-supervised approach, but the generalization beyond supervised music datasets remains untested.",
      "expert_true_value": "A practical and effective waveform source-separation architecture enhanced with remix semi-supervision from unlabeled music, demonstrating viability of waveform methodologies in music separation.",
      "canon_before": "Music source separation mostly relied on spectrogram masking with limited waveform-domain performance.",
      "delta_from_canon": "Introduces direct waveform-domain separation with the Demucs architecture and remix-based augmentation using unlabeled data for semi-supervision.",
      "position_in_field": "Waveform source separation approach advancing music source separation benchmarking, outside silent speech domain.",
      "practical_value": "Useful for music production and audio engineering source separation tasks, with no direct SSI utility.",
      "axes_moved": "system_design; evaluation",
      "axes_unresolved": "Generalization beyond benchmark music source separation datasets",
      "axes_regressed": "",
      "technical_limits": "Performance bounded to benchmark music datasets; needs large labeled or well-curated unlabeled data; no silent speech adaptation.",
      "evaluation_limits": "Benchmark limited to MusDB and unlabeled music datasets; evaluation focuses on SDR in the standard SiSec framework.",
      "deployment_limits": "Limited to offline music source separation; no provision for real-time, mobile, or silent speech use.",
      "scope_limits": "Limited to music source separation from waveform data; unrelated to silent speech recognition or synthesis.",
      "task": "music source separation",
      "input_modality": "audio",
      "sensor_hardware": "",
      "body_site": "",
      "output_type": "separated audio stems",
      "vocabulary_type": "",
      "vocabulary_size": "",
      "metrics": "Median SDR on MusDB test set, quantitative comparison to Wave-U-Net and spectrogram MMDense variants; ablation studies on training and architecture.",
      "evaluation_mode": "Benchmark comparison using standard SiSec MusDB test splits and structured SDR metrics.",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 0.95,
          "statement": "Our contribution is two fold. (i) We introduce a simple convolutional and recurrent model that outperforms the state-of-the-art model on waveforms, that is, Wave-U-Net by 1.6 points of SDR. (ii) We propose a new scheme to leverage unlabeled music. We train a first model to extract parts with at least one source silent in unlabeled tracks and remix this extract with a source from supervised data for weak supervision.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_demucs-deep-extractor-for-music-sources-with-extra-unlabeled-data-remixed-demucs-deep-extractor-for-music-sources-with-extra-unlabeled-data-remixed.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_demucs-deep-extractor-for-music-sources-with-extra-unlabeled-data-remixed-demucs-deep-extractor-for-music-sources-with-extra-unlabeled-data-remixed.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 0.95,
          "statement": "The architecture is composed of a convolutional encoder, an LSTM, and a convolutional decoder with GLU activations and skip U-Net connections and large stride convolutions; it outperforms Wave-U-Net due to larger number of channels and synthesis via transposed convolutions rather than iterative upsampling.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_demucs-deep-extractor-for-music-sources-with-extra-unlabeled-data-remixed-demucs-deep-extractor-for-music-sources-with-extra-unlabeled-data-remixed.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_demucs-deep-extractor-for-music-sources-with-extra-unlabeled-data-remixed-demucs-deep-extractor-for-music-sources-with-extra-unlabeled-data-remixed.txt",
          "section_or_location": "3   Model Architecture",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "fact",
          "confidence": 0.9,
          "statement": "We use the MusDB dataset composed of 150 songs with separate stems for drums, bass, other, and vocals, divided into 100 training and 50 test songs, along with 2,000 unlabeled additional tracks as unlabeled data for semi-supervised training.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_demucs-deep-extractor-for-music-sources-with-extra-unlabeled-data-remixed-demucs-deep-extractor-for-music-sources-with-extra-unlabeled-data-remixed.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_demucs-deep-extractor-for-music-sources-with-extra-unlabeled-data-remixed-demucs-deep-extractor-for-music-sources-with-extra-unlabeled-data-remixed.txt",
          "section_or_location": "5.1       Evaluation framework",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.9,
          "statement": "Performance is reported by median Signal-to-Distortion Ratio (SDR) over the MusDB test set, consistent with standard SiSec Mus evaluation metrics.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_demucs-deep-extractor-for-music-sources-with-extra-unlabeled-data-remixed-demucs-deep-extractor-for-music-sources-with-extra-unlabeled-data-remixed.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_demucs-deep-extractor-for-music-sources-with-extra-unlabeled-data-remixed-demucs-deep-extractor-for-music-sources-with-extra-unlabeled-data-remixed.txt",
          "section_or_location": "5.1       Evaluation framework",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 0.85,
          "statement": "The evaluation is limited to music source separation of four sources on the MusDB benchmark and does not cover silent speech datasets or deployment scenarios.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_demucs-deep-extractor-for-music-sources-with-extra-unlabeled-data-remixed-demucs-deep-extractor-for-music-sources-with-extra-unlabeled-data-remixed.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_demucs-deep-extractor-for-music-sources-with-extra-unlabeled-data-remixed-demucs-deep-extractor-for-music-sources-with-extra-unlabeled-data-remixed.txt",
          "section_or_location": "5      Experimental results",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 0.9,
          "statement": "Deployment is limited because the system only targets offline music source separation, without provision for real-time, mobile, or silent speech usage.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_demucs-deep-extractor-for-music-sources-with-extra-unlabeled-data-remixed-demucs-deep-extractor-for-music-sources-with-extra-unlabeled-data-remixed.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_demucs-deep-extractor-for-music-sources-with-extra-unlabeled-data-remixed-demucs-deep-extractor-for-music-sources-with-extra-unlabeled-data-remixed.txt",
          "section_or_location": "Conclusion",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 0.95,
          "statement": "The remix-based semi-supervised training scheme uses a classifier trained to detect silence of sources in unlabeled tracks, extracting silent excerpts which are then remixed with isolated sources from supervised data to produce strong and weak supervision losses.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_demucs-deep-extractor-for-music-sources-with-extra-unlabeled-data-remixed-demucs-deep-extractor-for-music-sources-with-extra-unlabeled-data-remixed.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_demucs-deep-extractor-for-music-sources-with-extra-unlabeled-data-remixed-demucs-deep-extractor-for-music-sources-with-extra-unlabeled-data-remixed.txt",
          "section_or_location": "4   Unlabeled Data Remixing",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_attention-based-convolutional-recurrent-neural-network-for-environmental-sound-classification",
      "slug": "attention-based-convolutional-recurrent-neural-network-for-environmental-sound-classification",
      "title": "Attention based Convolutional Recurrent Neural Network for Environmental Sound Classification",
      "year": 2019,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Zhichao Zhang",
        "Shugong Xu",
        "Tianhao Qiao",
        "Shunq Zhang"
      ],
      "url": "https://nao-ki-mura.com/paper/attention-based-convolutional-recurrent-neural-network-for-environmental-sound-classification",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "1907.02230",
      "arxiv_url": "https://arxiv.org/abs/1907.02230",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "medium",
      "review_basis": "full_text+structured_benchmark+summary",
      "source_coverage": "high",
      "tags": [
        "modality:acoustic",
        "evaluation:structured-benchmark",
        "output:labels",
        "task:audio-classification"
      ],
      "expert_take_short": "The proposed frame-level attention integrated within a convolutional recurrent network effectively improves environmental sound classification accuracy on ESC benchmarks by focusing on informative temporal frames while suppressing irrelevant or silent ones.",
      "expert_take_long": "This paper presents a convolutional recurrent neural network enhanced with a frame-level attention mechanism to improve environmental sound classification by focusing on semantically meaningful acoustic frames while suppressing silent or noisy segments. Using Log-Gammatone spectrogram features, the method combines CNN layers for spatial feature extraction and bidirectional GRUs for temporal modeling. The frame-level attention can be applied at multiple points, with best performance reported when applied after recurrent layers. Experiments on ESC-10 and ESC-50 datasets show that this attention mechanism notably improves classification accuracy compared to baseline methods, outperforming several recent state-of-the-art approaches. However, the evaluation is limited to these datasets and standard augmentations without robustness assessment in varying environmental conditions or real-time implementation considerations. Although highly relevant for acoustic scene classification benchmarks, this method is peripheral to silent speech interface research, as it does not address speech or articulatory signals directly. The paper makes a valuable contribution as a refined benchmark system for ESC, especially regarding selective temporal frame weighting within deep neural architectures.",
      "expert_true_value": "The work demonstrates the benefit of explicitly modeling temporal frame importance via attention in a unified CRNN, leading to improved feature representation and classification accuracy over uniform frame treatment used in prior ESC models.",
      "canon_before": "Prior ESC approaches typically treat all temporal frames of audio clips uniformly without explicit attention or weighting, and often rely on purely convolutional or recurrent architectures without integrated attention.",
      "delta_from_canon": "Introduces explicit frame-level attention mechanism layers within a convolutional recurrent neural network, enabling selective temporal weighting at multiple network layers (CNN and RNN), which improves the quality of learned feature representations and classification accuracy.",
      "position_in_field": "A notable benchmark paper enhancing environmental sound classification accuracy via integrated attention mechanisms within CRNNs, though peripheral to silent speech interface research.",
      "practical_value": "Practically valuable as a benchmark advancement in environmental sound classification accuracy but lacks validation for silent speech interface applications or real-time deployments.",
      "axes_moved": "system_design; evaluation",
      "axes_unresolved": "Generalization beyond the ESC-10 and ESC-50 datasets and robustness to diverse environmental conditions remain unaddressed.",
      "axes_regressed": "",
      "technical_limits": "Limited to 5-second fixed-length audio clips; input feature design depends on Log-Gammatone spectrograms with delta features; no exploration of real-time processing or embedded platform deployment.",
      "evaluation_limits": "Evaluation is limited to ESC-10 and ESC-50 datasets using 5-second audio clips sampled at 44.1 kHz, employing 5-fold cross-validation and reporting primarily classification accuracy metrics. No testing was performed on datasets with real-world noise variability or unseen environmental conditions.",
      "deployment_limits": "The study lacks discussion on real-time processing capabilities, computational resource requirements, or deployment feasibility on embedded or mobile devices. It does not evaluate robustness to unseen noise types or operational environments, limiting immediate practical deployment.",
      "scope_limits": "Focuses solely on environmental sound classification; does not address speech or silent speech interface signals or applications.",
      "task": "audio-classification",
      "input_modality": "acoustic",
      "sensor_hardware": "",
      "body_site": "",
      "output_type": "labels",
      "vocabulary_type": "",
      "vocabulary_size": "",
      "metrics": "Classification accuracy measured with 5-fold cross-validation on ESC-10 and ESC-50 datasets; accuracy gain reported as absolute percentage improvements compared to baselines and other models.",
      "evaluation_mode": "Experimental benchmark evaluation on public ESC datasets with 5-fold cross-validation and augmentation techniques.",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 0.99,
          "statement": "We propose an attention mechanism-based convolutional RNN architecture (ACRNN) to focus on semantically relevant frames and produce discriminative features for ESC, demonstrating state-of-the-art performance on ESC-10 and ESC-50 datasets.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_attention-based-convolutional-recurrent-neural-network-for-environmental-sound-classification-attention-based-convolutional-recurrent-neural-network-for-environmental-sound-c.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_attention-based-convolutional-recurrent-neural-network-for-environmental-sound-classification-attention-based-convolutional-recurrent-neural-network-for-environmental-sound-c.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 0.98,
          "statement": "The model integrates a frame-level attention mechanism applied both at CNN and RNN layers within the convolutional recurrent neural network, allowing selective temporal weighting of features to emphasize semantically relevant sound frames.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_attention-based-convolutional-recurrent-neural-network-for-environmental-sound-classification-attention-based-convolutional-recurrent-neural-network-for-environmental-sound-c.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_attention-based-convolutional-recurrent-neural-network-for-environmental-sound-classification-attention-based-convolutional-recurrent-neural-network-for-environmental-sound-c.txt",
          "section_or_location": "2.3 Frame-level Attention Mechanism",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 0.95,
          "statement": "The method was evaluated on ESC-10 and ESC-50 datasets consisting of 5-second audio clips sampled at 44.1 kHz, using 5-fold cross-validation. Results were measured by classification accuracy and confusion matrices.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_attention-based-convolutional-recurrent-neural-network-for-environmental-sound-classification-attention-based-convolutional-recurrent-neural-network-for-environmental-sound-c.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_attention-based-convolutional-recurrent-neural-network-for-environmental-sound-classification-attention-based-convolutional-recurrent-neural-network-for-environmental-sound-c.txt",
          "section_or_location": "3 Experiments",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.93,
          "statement": "Classification accuracy is reported as percentage correct classification on ESC-10 and ESC-50 datasets, with the proposed ACRNN achieving 93.7% on ESC-10 and 86.1% on ESC-50, outperforming prior methods.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_attention-based-convolutional-recurrent-neural-network-for-environmental-sound-classification-attention-based-convolutional-recurrent-neural-network-for-environmental-sound-c.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_attention-based-convolutional-recurrent-neural-network-for-environmental-sound-classification-attention-based-convolutional-recurrent-neural-network-for-environmental-sound-c.txt",
          "section_or_location": "3 Experiments",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 0.9,
          "statement": "The evaluation does not include testing on unseen environmental conditions or noise profiles, and lacks discussion on real-time applicability or deployment constraints.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_attention-based-convolutional-recurrent-neural-network-for-environmental-sound-classification-attention-based-convolutional-recurrent-neural-network-for-environmental-sound-c.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_attention-based-convolutional-recurrent-neural-network-for-environmental-sound-classification-attention-based-convolutional-recurrent-neural-network-for-environmental-sound-c.txt",
          "section_or_location": "4 Conclusion",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_lipper-synthesizing-thy-speech-using-multi-view-lipreading",
      "slug": "lipper-synthesizing-thy-speech-using-multi-view-lipreading",
      "title": "Lipper: Synthesizing Thy Speech using Multi-View Lipreading",
      "year": 2019,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Yaman Kumar",
        "Rohit Jain",
        "Khwaja Mohd. Salik",
        "Rajiv Ratn Shah",
        "Yifang Yin",
        "Roger Zimmermann"
      ],
      "url": "https://nao-ki-mura.com/paper/lipper-synthesizing-thy-speech-using-multi-view-lipreading",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "1907.01367",
      "arxiv_url": "https://arxiv.org/abs/1907.01367",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:face",
        "body_site:lip",
        "modality:video",
        "output:speech-audio",
        "task:speech-reconstruction",
        "evaluation:quantitative",
        "deployment:real-time"
      ],
      "expert_take_short": "Strong multi-view lip-to-speech baseline with honest quality limits.",
      "expert_take_long": "The full text gives Lipper more credit than a quick skim would. The best three-view configuration at 0°, 45°, and 60° materially beats the single-view setups, and Table 11 shows why the authors can plausibly call it near real-time: 0.169 s versus roughly 0.94 to 1.95 s for the speechreading comparison. But the conclusion is equally important, because it openly admits robotic audio, controlled-camera assumptions, and weak speaker-independent behavior. This is a strong baseline, not a solved deployment story.",
      "expert_true_value": "A serious early multi-view video-to-speech system whose importance is the regression framing and practical latency analysis, though audio quality remains robotic and speaker independence is weak.",
      "canon_before": "Most lipreading systems classified phrases or words rather than synthesizing speech directly, and usually from a single view.",
      "delta_from_canon": "Lipper combines multiple camera views, regression-based speech reconstruction, OOV testing, and explicit delay analysis.",
      "position_in_field": "Core multi-view video speech-reconstruction work in SSI-adjacent silent video research.",
      "practical_value": "Useful as a baseline for multi-view lip-to-speech systems, especially when latency and OOV behavior matter.",
      "axes_moved": "multi_view_fusion; regression_framing; latency",
      "axes_unresolved": "speaker independence, naturalness, and in-the-wild robustness remain unresolved.",
      "axes_regressed": "",
      "technical_limits": "The generated audio is still robotic and lip-only sensing cannot recover prosody or full vocal tract information.",
      "evaluation_limits": "The work is confined to controlled OuluVS2 conditions and speaker-independent results remain weak.",
      "deployment_limits": "Real-world pose variation and broader speaker coverage are not solved.",
      "scope_limits": "Multi-view lip-video speech reconstruction only.",
      "task": "speech-reconstruction",
      "input_modality": "multi-view lip video",
      "sensor_hardware": "camera",
      "body_site": "face; lip",
      "output_type": "speech-audio",
      "vocabulary_type": "",
      "vocabulary_size": "",
      "metrics": "Best three-view setting 0°+45°+60° reaches PESQ 2.315, end-to-end delay stays at 0.169 s across phrases, and user-study accuracy is 80.25% audio-only / 81.25% audio-visual.",
      "evaluation_mode": "speaker-dependent and speaker-independent OuluVS2 PESQ benchmarks, OOV phrase tests, delay comparison, and user study",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 0.97,
          "statement": "The abstract says Lipper reconstructs speech from silent multi-view videos by treating lipreading as a regression task rather than classification.",
          "evidence_source": "/Users/naoki/projects/nao-ki-mura/inputs/ssi_fulltext/text/ssi_lipper-synthesizing-thy-speech-using-multi-view-lipreading-lipper-synthesizing-thy-speech-using-multi-view-lipreading.txt",
          "source_ref": "/Users/naoki/projects/nao-ki-mura/inputs/ssi_fulltext/text/ssi_lipper-synthesizing-thy-speech-using-multi-view-lipreading-lipper-synthesizing-thy-speech-using-multi-view-lipreading.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.97,
          "statement": "Table 6 reports the best three-view combination at 0°+45°+60° with PESQ 2.315, higher than any single-view result in Table 4.",
          "evidence_source": "/Users/naoki/projects/nao-ki-mura/inputs/ssi_fulltext/text/ssi_lipper-synthesizing-thy-speech-using-multi-view-lipreading-lipper-synthesizing-thy-speech-using-multi-view-lipreading.txt",
          "source_ref": "/Users/naoki/projects/nao-ki-mura/inputs/ssi_fulltext/text/ssi_lipper-synthesizing-thy-speech-using-multi-view-lipreading-lipper-synthesizing-thy-speech-using-multi-view-lipreading.txt",
          "section_or_location": "Table 6",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.96,
          "statement": "Table 11 shows Lipper at 0.169 seconds delay across phrases, substantially below the 0.94 to 1.95 second delays of the speechreading comparison.",
          "evidence_source": "/Users/naoki/projects/nao-ki-mura/inputs/ssi_fulltext/text/ssi_lipper-synthesizing-thy-speech-using-multi-view-lipreading-lipper-synthesizing-thy-speech-using-multi-view-lipreading.txt",
          "source_ref": "/Users/naoki/projects/nao-ki-mura/inputs/ssi_fulltext/text/ssi_lipper-synthesizing-thy-speech-using-multi-view-lipreading-lipper-synthesizing-thy-speech-using-multi-view-lipreading.txt",
          "section_or_location": "Table 11",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 0.94,
          "statement": "The conclusion says the audio remains robotic, the system assumes controlled camera conditions, and speaker-independent performance is not good enough for deployment.",
          "evidence_source": "/Users/naoki/projects/nao-ki-mura/inputs/ssi_fulltext/text/ssi_lipper-synthesizing-thy-speech-using-multi-view-lipreading-lipper-synthesizing-thy-speech-using-multi-view-lipreading.txt",
          "source_ref": "/Users/naoki/projects/nao-ki-mura/inputs/ssi_fulltext/text/ssi_lipper-synthesizing-thy-speech-using-multi-view-lipreading-lipper-synthesizing-thy-speech-using-multi-view-lipreading.txt",
          "section_or_location": "Conclusion",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_ultrasound-based-silent-speech-interface-built-on-a-continuous-vocoder",
      "slug": "ultrasound-based-silent-speech-interface-built-on-a-continuous-vocoder",
      "title": "Ultrasound-based Silent Speech Interface Built on a Continuous Vocoder",
      "year": 2019,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Moham Salah Alradhi",
        "Géza Németh",
        "Gábor Gosztolya",
        "Tamás Gábor Csapó",
        "László Tóth",
        "Alexandra Markó"
      ],
      "url": "https://nao-ki-mura.com/paper/ultrasound-based-silent-speech-interface-built-on-a-continuous-vocoder",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "1906.09885",
      "arxiv_url": "https://arxiv.org/abs/1906.09885",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium-high",
      "review_confidence": "medium-high",
      "review_basis": "full_text+structured_benchmark+summary",
      "source_coverage": "high",
      "tags": [
        "body_site:tongue",
        "modality:ultrasound",
        "output:speech-audio",
        "evaluation:structured-benchmark",
        "task:speech-reconstruction"
      ],
      "expert_take_short": "The key advancement is continuous F0 tracking via CNNs yielding lower pitch error and slight naturalness improvement over discontinuous F0 pipelines in ultrasound SSI.",
      "expert_take_long": "This paper presents an incremental but meaningful refinement in ultrasound-based SSIs by employing a continuous F0 vocoder predicting ContF0 and MVF directly from ultrasound tongue images via CNNs. The continuous F0 modeling reduces F0 RMSE substantially compared to discontinuous F0 baseline. Subjectively, synthesized speech gains slight, though statistically non-significant, naturalness improvement. The study uses a four-speaker Hungarian dataset with limited utterance duration, thus generalization, real-time deployment beyond controlled settings, and cross-speaker robustness need further exploration. Nevertheless, the method provides a computationally feasible vocoder design that simplifies excitation modeling in SSI and contributes to the field by transitioning from discontinuous to continuous pitch modeling.",
      "expert_true_value": "Demonstrates that continuous pitch estimation from ultrasound articulatory data is feasible and beneficial for SSI vocoders, improving pitch modeling and slightly enhancing speech naturalness, without introducing new sensor modalities.",
      "canon_before": "Prior UTI-based SSI vocoder systems typically predicted discontinuous F0 with a binary voiced/unvoiced classification followed by voiced F0 regression.",
      "delta_from_canon": "Replaces the discontinuous voiced/unvoiced F0 prediction pipeline with continuous F0 interpolation and a continuous vocoder framework predicting ContF0 and MVF parameters.",
      "position_in_field": "A vocoder-focused SSI articulation-to-speech synthesis study focusing on continuous pitch modeling from ultrasound tongue images.",
      "practical_value": "Potentially improves synthesized speech naturalness in ultrasound-based SSI by refining pitch modeling, but practical deployment is limited by dataset size and hardware requirements.",
      "axes_moved": "system_design; evaluation",
      "axes_unresolved": "Cross-speaker generalization and robustness beyond the four-speaker UTI dataset used remain unaddressed.",
      "axes_regressed": "",
      "technical_limits": "Limited speaker generalization; linked to fixed ultrasound frame rate for prediction; MVF estimation accuracy varies by speaker; CNN uses single frames rather than consecutive or recurrent context which might improve accuracy.",
      "evaluation_limits": "Evaluated using a small dataset of four speakers; objective metrics include V/UV classification accuracy and F0 RMSE, while subjective MUSHRA tests show minor non-significant gains in naturalness.",
      "deployment_limits": "Requires ultrasound tongue imaging hardware and a CNN-based vocoder pipeline; real-time capabilities are mentioned but deployment is limited by hardware and speaker-dependent training.",
      "scope_limits": "Limited to ultrasound tongue imaging based SSI for speech reconstruction; small four-speaker Hungarian dataset; no speaker-independent evaluation.",
      "task": "speech-reconstruction",
      "input_modality": "ultrasound tongue imaging",
      "sensor_hardware": "Ultrasound tongue imaging system capturing midsagittal tongue ultrasound cine at ~82 fps.",
      "body_site": "tongue",
      "output_type": "speech-audio",
      "vocabulary_type": "",
      "vocabulary_size": "",
      "metrics": "V/UV classification accuracy (about 78.8% average), F0 Root Mean Square Error (RMSE) in Hz (continuous F0 ~30.6 Hz, baseline discontinuous ~65.3 Hz), Maximum Voiced Frequency RMSE (654–1177 Hz range depending on speaker), subjective MUSHRA naturalness scores with no significant difference but trend favoring continuous vocoder.",
      "evaluation_mode": "Objective (V/UV accuracy, RMSE) plus subjective listening tests (MUSHRA) by native Hungarian speakers.",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 0.95,
          "statement": "We use a continuous F0 tracker which does not apply a strict voiced/unvoiced decision. Continuous vocoder parameters (ContF0, Maximum Voiced Frequency and Mel-Generalized Cepstrum) are predicted using a convolutional neural network with UTI as input, improving naturalness compared to baseline vocoder using standard discontinuous F0.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_ultrasound-based-silent-speech-interface-built-on-a-continuous-vocoder-ultrasound-based-silent-speech-interface-built-on-a-continuous-vocoder.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_ultrasound-based-silent-speech-interface-built-on-a-continuous-vocoder-ultrasound-based-silent-speech-interface-built-on-a-continuous-vocoder.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 0.9,
          "statement": "The proposed system uses continuous pitch tracking via interpolation of F0 in unvoiced regions, avoiding the discontinuities caused by strict voiced/unvoiced binary splits common in prior SSI vocoders.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_ultrasound-based-silent-speech-interface-built-on-a-continuous-vocoder-ultrasound-based-silent-speech-interface-built-on-a-continuous-vocoder.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_ultrasound-based-silent-speech-interface-built-on-a-continuous-vocoder-ultrasound-based-silent-speech-interface-built-on-a-continuous-vocoder.txt",
          "section_or_location": "2. Continuous F0 modeling within vocoders",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.95,
          "statement": "Objective evaluation shows average V/UV decision accuracy around 78.8%, baseline F0 prediction RMSE about 65 Hz, and continuous F0 prediction RMSE about 30.6 Hz across four speakers, indicating the continuous F0 model yields significantly lower pitch error.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_ultrasound-based-silent-speech-interface-built-on-a-continuous-vocoder-ultrasound-based-silent-speech-interface-built-on-a-continuous-vocoder.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_ultrasound-based-silent-speech-interface-built-on-a-continuous-vocoder-ultrasound-based-silent-speech-interface-built-on-a-continuous-vocoder.txt",
          "section_or_location": "4.2. Objective evaluation",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 0.9,
          "statement": "Data recorded from four Hungarian speakers (2 female, 2 male), about 15-minute recordings each composed of 209 sentences, split into training/validation/test sets; evaluation on 9 test sentences per speaker; speaker-dependent CNN models trained separately.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_ultrasound-based-silent-speech-interface-built-on-a-continuous-vocoder-ultrasound-based-silent-speech-interface-built-on-a-continuous-vocoder.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_ultrasound-based-silent-speech-interface-built-on-a-continuous-vocoder-ultrasound-based-silent-speech-interface-built-on-a-continuous-vocoder.txt",
          "section_or_location": "3.1. Data acquisition",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 0.9,
          "statement": "Subjective MUSHRA listening tests conducted with 23 native Hungarian listeners comparing baseline and continuous vocoder synthetic speech; results indicate slightly better but not statistically significant naturalness in continuous vocoder output.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_ultrasound-based-silent-speech-interface-built-on-a-continuous-vocoder-ultrasound-based-silent-speech-interface-built-on-a-continuous-vocoder.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_ultrasound-based-silent-speech-interface-built-on-a-continuous-vocoder-ultrasound-based-silent-speech-interface-built-on-a-continuous-vocoder.txt",
          "section_or_location": "4.3. Subjective listening test",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 0.85,
          "statement": "The system uses speaker-dependent models trained on ultrasound data recorded at fixed frame rate (~82 fps); the CNN input are single ultrasound images without recurrent context, which limits generalization and temporal modeling; cross-speaker and broader corpus robustness are not demonstrated.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_ultrasound-based-silent-speech-interface-built-on-a-continuous-vocoder-ultrasound-based-silent-speech-interface-built-on-a-continuous-vocoder.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_ultrasound-based-silent-speech-interface-built-on-a-continuous-vocoder-ultrasound-based-silent-speech-interface-built-on-a-continuous-vocoder.txt",
          "section_or_location": "3.4. DNN training with the baseline vocoder",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "deployment_claim",
          "confidence": 0.9,
          "statement": "The vocoder design uses low-dimensional excitation parameters (ContF0 and MVF) predicted by CNNs from ultrasound images, enabling computationally feasible speech synthesis potentially suitable for real-time operation, though adaptation to wearable or mobile platforms and speaker-independent models are not addressed.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_ultrasound-based-silent-speech-interface-built-on-a-continuous-vocoder-ultrasound-based-silent-speech-interface-built-on-a-continuous-vocoder.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_ultrasound-based-silent-speech-interface-built-on-a-continuous-vocoder-ultrasound-based-silent-speech-interface-built-on-a-continuous-vocoder.txt",
          "section_or_location": "5. Conclusions",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_video-driven-speech-reconstruction-using-generative-adversarial-networks",
      "slug": "video-driven-speech-reconstruction-using-generative-adversarial-networks",
      "title": "Video-Driven Speech Reconstruction using Generative Adversarial Networks",
      "year": 2019,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Konstantinos Vougioukas",
        "Pingchuan Ma",
        "Stavros Petridis",
        "Maja Pantic"
      ],
      "url": "https://nao-ki-mura.com/paper/video-driven-speech-reconstruction-using-generative-adversarial-networks",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "1906.06301",
      "arxiv_url": "https://arxiv.org/abs/1906.06301",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:face",
        "body_site:lip",
        "modality:video",
        "output:speech-audio",
        "task:speech-reconstruction",
        "evaluation:quantitative",
        "evaluation:structured-benchmark"
      ],
      "expert_take_short": "Foundational direct video-to-audio result with clear generalization limits.",
      "expert_take_long": "The full text justifies why this paper matters: it pushes past intermediate-feature pipelines and gets to direct raw-audio generation with intelligible outputs on GRID. Table 2 shows a meaningful intelligibility win over Lip2AudSpec even though PESQ is slightly worse, and the ablation study makes clear the perceptual and adversarial losses are doing real work. The same text also keeps the review honest: unseen speakers degrade sharply, voice identity can morph, and the method only handles frontal faces.",
      "expert_true_value": "A foundational lip-to-speech paper because it demonstrates intelligible direct audio generation from silent video, but the unseen-speaker degradation is substantial and the method is frontal-view only.",
      "canon_before": "Earlier video-to-speech pipelines often relied on intermediate speech features or text and were mostly speaker-dependent.",
      "delta_from_canon": "This model learns direct silent-video to raw-audio synthesis with GAN and perceptual losses and evaluates both seen and unseen speakers.",
      "position_in_field": "Core early video-driven speech reconstruction work.",
      "practical_value": "Useful as a historically important baseline for direct video-to-audio synthesis.",
      "axes_moved": "direct_audio_generation; lip_to_speech; synchrony",
      "axes_unresolved": "unseen-speaker fidelity, broader visual conditions, and natural voice quality remain unresolved.",
      "axes_regressed": "",
      "technical_limits": "Artifacts remain, unseen-speaker voice consistency is poor, and the model is restricted to frontal faces.",
      "evaluation_limits": "All reported experiments are on GRID, which constrains linguistic and visual diversity.",
      "deployment_limits": "In-the-wild pose variation and real-world deployment are future work.",
      "scope_limits": "Frontal silent-video speech reconstruction only.",
      "task": "speech-reconstruction",
      "input_modality": "silent frontal face video",
      "sensor_hardware": "camera",
      "body_site": "face; lip",
      "output_type": "speech-audio",
      "vocabulary_type": "",
      "vocabulary_size": "",
      "metrics": "In the speaker-dependent setup the model reports WER 26.6%, STOI 0.518, MCD 22.29, and AV confidence 4.4 with one-frame offset; unseen speakers drop to WER 40.5% and PESQ 1.24.",
      "evaluation_mode": "GRID speaker-dependent and speaker-independent evaluation using PESQ, WER, AV synchrony, STOI, and MCD plus ablations",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 0.97,
          "statement": "The abstract says the model directly synthesizes audio from silent video and evaluates both speaker-dependent and speaker-independent scenarios on GRID.",
          "evidence_source": "/Users/naoki/projects/nao-ki-mura/inputs/ssi_fulltext/text/ssi_video-driven-speech-reconstruction-using-generative-adversarial-networks-video-driven-speech-reconstruction-using-generative-adversarial-networks.txt",
          "source_ref": "/Users/naoki/projects/nao-ki-mura/inputs/ssi_fulltext/text/ssi_video-driven-speech-reconstruction-using-generative-adversarial-networks-video-driven-speech-reconstruction-using-generative-adversarial-networks.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.97,
          "statement": "Table 2 reports speaker-dependent WER 26.6%, STOI 0.518, MCD 22.29, and AV confidence 4.4 with one-frame offset for the proposed model.",
          "evidence_source": "/Users/naoki/projects/nao-ki-mura/inputs/ssi_fulltext/text/ssi_video-driven-speech-reconstruction-using-generative-adversarial-networks-video-driven-speech-reconstruction-using-generative-adversarial-networks.txt",
          "source_ref": "/Users/naoki/projects/nao-ki-mura/inputs/ssi_fulltext/text/ssi_video-driven-speech-reconstruction-using-generative-adversarial-networks-video-driven-speech-reconstruction-using-generative-adversarial-networks.txt",
          "section_or_location": "Table 2",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 0.95,
          "statement": "The ablation study says the adversarial loss is necessary for speech production and the perceptual loss is critical for preserving content.",
          "evidence_source": "/Users/naoki/projects/nao-ki-mura/inputs/ssi_fulltext/text/ssi_video-driven-speech-reconstruction-using-generative-adversarial-networks-video-driven-speech-reconstruction-using-generative-adversarial-networks.txt",
          "source_ref": "/Users/naoki/projects/nao-ki-mura/inputs/ssi_fulltext/text/ssi_video-driven-speech-reconstruction-using-generative-adversarial-networks-video-driven-speech-reconstruction-using-generative-adversarial-networks.txt",
          "section_or_location": "Table 3",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 0.94,
          "statement": "The conclusion says unseen speakers still produce degraded voices and that the method operates solely on frontal faces, leaving in-the-wild video as future work.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_video-driven-speech-reconstruction-using-generative-adversarial-networks-video-driven-speech-reconstruction-using-generative-adversarial-networks.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_video-driven-speech-reconstruction-using-generative-adversarial-networks-video-driven-speech-reconstruction-using-generative-adversarial-networks.txt",
          "section_or_location": "5 Conclusions",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_a-novel-task-oriented-text-corpus-in-silent-speech-recognition-and-its-natural-language-generation-construction-method",
      "slug": "a-novel-task-oriented-text-corpus-in-silent-speech-recognition-and-its-natural-language-generation-construction-method",
      "title": "A Novel Task-Oriented Text Corpus in Silent Speech Recognition and its Natural Language Generation Construction Method",
      "year": 2019,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Dong Cao",
        "Dongdong Zhang",
        "HaiBo Chen"
      ],
      "url": "https://nao-ki-mura.com/paper/a-novel-task-oriented-text-corpus-in-silent-speech-recognition-and-its-natural-language-generation-construction-method",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "1905.01974",
      "arxiv_url": "https://arxiv.org/abs/1905.01974",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "medium",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:brain",
        "modality:eeg",
        "output:text",
        "task:dataset"
      ],
      "expert_take_short": "Useful EEG-SSR corpus framing paper, but evidence is lighter than a full benchmark paper.",
      "expert_take_long": "The full text supports the paper as a corpus-construction argument, not as a mature SSR system result. Its strongest move is to reject open-domain language coverage and instead build a life-support-oriented corpus with controlled seed templates and neural diversification. The weakness is equally clear: the extracted paper states that the hybrid approach beats pure methods, but it does not surface the quantitative table needed to judge the margin, so the review should stay scoped to dataset design.",
      "expert_true_value": "Its value is dataset framing rather than decoder performance: it is trying to make EEG SSR data collection tractable by shrinking the language problem.",
      "canon_before": "EEG-based SSR lacked a consensus text corpus, making large paired EEG-text collection prohibitively expensive.",
      "delta_from_canon": "The paper narrows the target domain to life-support conversations and uses hybrid NLG to grow a structured SSR corpus from a controlled seed set.",
      "position_in_field": "SSI-adjacent corpus paper for EEG-based SSR.",
      "practical_value": "Useful if the goal is bootstrapping domain-specific EEG-text data collection, not demonstrating end-to-end SSR decoding.",
      "axes_moved": "dataset_design; domain_restriction; eeg_ssr_bootstrapping",
      "axes_unresolved": "actual decoder gains and transfer beyond the task-oriented domain remain unresolved.",
      "axes_regressed": "",
      "technical_limits": "The paper does not itself solve EEG decoding and relies on a narrow task-oriented domain.",
      "evaluation_limits": "Quantitative evidence is weak in the extracted full text, so comparative strength cannot be audited tightly.",
      "deployment_limits": "No deployed SSR system is shown.",
      "scope_limits": "Corpus construction for EEG-based SSR only.",
      "task": "dataset",
      "input_modality": "EEG-oriented SSR text corpus design",
      "sensor_hardware": "EEG headset",
      "body_site": "brain",
      "output_type": "text",
      "vocabulary_type": "",
      "vocabulary_size": "",
      "metrics": "The full text claims the hybrid method outperforms pure template-based and pure neural NLG approaches in SSR experiments, but the extracted paper text does not expose a numeric benchmark table.",
      "evaluation_mode": "methodological corpus construction with qualitative and comparative SSR experiment claims",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 0.96,
          "statement": "The abstract says the paper constructs a task-oriented text corpus for SSR and proposes a hybrid construction method based on natural language generation.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_a-novel-task-oriented-text-corpus-in-silent-speech-recognition-and-its-natural-language-generation-construction-method-a-novel-task-oriented-text-corpus-in-silent-speech-recognition-and-its-natural-l.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_a-novel-task-oriented-text-corpus-in-silent-speech-recognition-and-its-natural-language-generation-construction-method-a-novel-task-oriented-text-corpus-in-silent-speech-recognition-and-its-natural-l.txt",
          "section_or_location": "ABSTRACT",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 0.93,
          "statement": "Section 3.2 defines the hybrid workflow: build a small high-quality seed corpus and then expand it with a neural generation model.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_a-novel-task-oriented-text-corpus-in-silent-speech-recognition-and-its-natural-language-generation-construction-method-a-novel-task-oriented-text-corpus-in-silent-speech-recognition-and-its-natural-l.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_a-novel-task-oriented-text-corpus-in-silent-speech-recognition-and-its-natural-language-generation-construction-method-a-novel-task-oriented-text-corpus-in-silent-speech-recognition-and-its-natural-l.txt",
          "section_or_location": "3.2 Task-Oriented Hybrid Models",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 0.88,
          "statement": "Section 4 says the hybrid model outperforms pure template-based and pure neural NLG methods in SSR experiments, but the extracted text does not expose numeric margins.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_a-novel-task-oriented-text-corpus-in-silent-speech-recognition-and-its-natural-language-generation-construction-method-a-novel-task-oriented-text-corpus-in-silent-speech-recognition-and-its-natural-l.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_a-novel-task-oriented-text-corpus-in-silent-speech-recognition-and-its-natural-language-generation-construction-method-a-novel-task-oriented-text-corpus-in-silent-speech-recognition-and-its-natural-l.txt",
          "section_or_location": "4. RESULTS AND ANALYSIS",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 0.93,
          "statement": "Section 2 explains that EEG-based SSR lacks ready-made EEG-text pair datasets and that broad open-domain collection is prohibitively expensive, which is why the paper narrows to task-oriented scenes.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_a-novel-task-oriented-text-corpus-in-silent-speech-recognition-and-its-natural-language-generation-construction-method-a-novel-task-oriented-text-corpus-in-silent-speech-recognition-and-its-natural-l.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_a-novel-task-oriented-text-corpus-in-silent-speech-recognition-and-its-natural-language-generation-construction-method-a-novel-task-oriented-text-corpus-in-silent-speech-recognition-and-its-natural-l.txt",
          "section_or_location": "2. TEXT CORPUS FOR SSR",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_autoencoder-based-articulatory-to-acoustic-mapping-for-ultrasound-silent-speech-interfaces",
      "slug": "autoencoder-based-articulatory-to-acoustic-mapping-for-ultrasound-silent-speech-interfaces",
      "title": "Autoencoder-Based Articulatory-to-Acoustic Mapping for Ultrasound Silent Speech Interfaces",
      "year": 2019,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Gábor Gosztolya",
        "MTA-SZTE Research Group on Artificial Intelligence",
        "University of Szeged",
        "Hungary",
        "Gábor Infusz",
        "ginfusz@inf.u-szeged.hu",
        "Ádám Pintér",
        "Institute of Informatics",
        "University of Szeged",
        "Szeged",
        "Hungary",
        "László Tóth",
        "Institute of Informatics",
        "University of Szeged",
        "Szeged",
        "Hungary",
        "Alexandra Markó",
        "Department of Phonetics",
        "Eötvös Loránd University",
        "MTA-ELTE Lendület Lingual Articulation Research Group",
        "Budapest",
        "Hungary",
        "Gábor Csapó",
        "Department of Telecommunications and Media Informatics",
        "Budapest University of Technology and Economics",
        "MTA-ELTE Lendület Lingual Articulation Research Group",
        "Budapest",
        "Hungary"
      ],
      "url": "https://nao-ki-mura.com/paper/autoencoder-based-articulatory-to-acoustic-mapping-for-ultrasound-silent-speech-interfaces",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "1904.05259",
      "arxiv_url": "https://arxiv.org/abs/1904.05259",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium-high",
      "review_confidence": "medium-high",
      "review_basis": "full_text+structured_benchmark+summary",
      "source_coverage": "high",
      "tags": [
        "body_site:tongue",
        "modality:ultrasound",
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "output:speech-audio",
        "task:speech-reconstruction"
      ],
      "expert_take_short": "The paper advances ultrasound silent speech interfaces by compressing ultrasound images using an autoencoder bottleneck prior to spectral parameter prediction, resulting in improved accuracy and more natural synthesized speech with smaller models.",
      "expert_take_long": "This study presents a system integration and efficiency improvement for ultrasound-based silent speech interfaces by compressing high-dimensional ultrasound tongue images via an autoencoder neural network. Instead of using the full raw pixel intensities as input to spectral parameter estimation neural networks, the authors extract bottleneck layer activations from the autoencoder as compressed features. This compression reduces redundancy and noise inherent in ultrasound images. The two-step process (autoencoder encoding and spectral DNN prediction) yields significantly better normalized mean squared error and correlation in predicting Mel-Generalized Cepstral Line Spectral Pair (MGC-LSP) parameters compared to pixel-wise inputs. Notably, this compression allows using multiple consecutive frames as input without an explosion in model size, further improving performance. Subjective listening tests demonstrate that synthesized speech from the compressed features scores higher naturalness than baseline methods. However, the study is limited to a single speaker dataset and acoustic conditions, with no investigation into session variability or real-time deployment. Overall, the work offers a practical system design advance in ultrasound SSI, emphasizing efficiency and improved acoustic prediction accuracy via latent representation learning.",
      "expert_true_value": "The key contribution lies in using an autoencoder to reduce ultrasound image redundancy and noise, enabling more compact and accurate articulatory-to-acoustic mapping, rather than novel acoustic modeling or SSI modality innovation.",
      "canon_before": "Ultrasound SSI typically used the whole ultrasound image pixel intensity frame as input features directly to deep neural network spectral parameter predictors.",
      "delta_from_canon": "Replaces direct pixel intensity input with compressed latent features extracted by an autoencoder from ultrasound frames before spectral parameter prediction. Uses activations from bottleneck layer as input features, enabling use of multiple consecutive frames without increasing model size excessively.",
      "position_in_field": "Efficiency-oriented ultrasound articulatory-to-acoustic mapping paper with latent representation learning.",
      "practical_value": "Offers a method to reduce model size and improve parameter prediction in ultrasound SSI, potentially aiding more compact and efficient ultrasound speech interfaces.",
      "axes_moved": "system_design; evaluation",
      "axes_unresolved": "cross-speaker and cross-session generalization, real-time deployment",
      "axes_regressed": "",
      "technical_limits": "Autoencoder architecture details limited; only single-speaker data; limited exploration of bottleneck size impact; no speaker adaptation explored; no real-time or mobile deployment shown.",
      "evaluation_limits": "Evaluation limited to a single-speaker ultrasound corpus; no cross-speaker or session testing; listening test conducted with native speakers on synthesized speech but no large-scale subjective evaluation.",
      "deployment_limits": "Requires ultrasound imaging hardware in controlled setup; no demonstration of real-time or mobile deployment; unknown robustness across speakers or sessions.",
      "scope_limits": "Focused on ultrasound tongue imaging SSI for single-speaker spectral parameter regression; excludes multi-speaker, recognition, or broader SSI types.",
      "task": "speech-reconstruction",
      "input_modality": "ultrasound video",
      "sensor_hardware": "Ultrasound imaging system with a 2-4 MHz convex array transducer producing mid-sagittal tongue images at 82 fps.",
      "body_site": "tongue",
      "output_type": "speech-audio",
      "vocabulary_type": "",
      "vocabulary_size": "",
      "metrics": "Normalized Mean Squared Error (NMSE) for MGC-LSP prediction averaged over 25 parameters; average Pearson correlation coefficient between true and predicted spectral parameters; MUSHRA listening scores for naturalness on synthesized speech.",
      "evaluation_mode": "experimental study with quantitative regression metrics (NMSE, Pearson correlation) and subjective listening (MUSHRA) test for naturalness",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 0.95,
          "statement": "In this study we train an autoencoder neural network on the ultrasound images; the estimation of the spectral speech parameters is done by a second DNN, using the activations of the bottleneck layer of the autoencoder as features.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_autoencoder-based-articulatory-to-acoustic-mapping-for-ultrasound-silent-speech-interfaces-autoencoder-based-articulatory-to-acoustic-mapping-for-ultrasound-silent-speech-.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_autoencoder-based-articulatory-to-acoustic-mapping-for-ultrasound-silent-speech-interfaces-autoencoder-based-articulatory-to-acoustic-mapping-for-ultrasound-silent-speech-.txt",
          "section_or_location": "I. INTRODUCTION",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.9,
          "statement": "Normalized mean squared error scores were lower, correlation values were higher, and listening test results showed more natural sounding synthesized utterances when using autoencoder bottleneck features versus full-image baseline.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_autoencoder-based-articulatory-to-acoustic-mapping-for-ultrasound-silent-speech-interfaces-autoencoder-based-articulatory-to-acoustic-mapping-for-ultrasound-silent-speech-.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_autoencoder-based-articulatory-to-acoustic-mapping-for-ultrasound-silent-speech-interfaces-autoencoder-based-articulatory-to-acoustic-mapping-for-ultrasound-silent-speech-.txt",
          "section_or_location": "IV. RESULTS USING OBJECTIVE MEASUREMENTS",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 0.9,
          "statement": "The method was evaluated only on ultrasound recordings from one Hungarian female speaker reading 438 sentences; no cross-speaker or cross-session generalization was tested.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_autoencoder-based-articulatory-to-acoustic-mapping-for-ultrasound-silent-speech-interfaces-autoencoder-based-articulatory-to-acoustic-mapping-for-ultrasound-silent-speech-.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_autoencoder-based-articulatory-to-acoustic-mapping-for-ultrasound-silent-speech-interfaces-autoencoder-based-articulatory-to-acoustic-mapping-for-ultrasound-silent-speech-.txt",
          "section_or_location": "III. EXPERIMENTAL SETUP",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "deployment_claim",
          "confidence": 0.85,
          "statement": "No real-time or live deployment was demonstrated; the system requires a fixed ultrasound transducer setup and substantial computational resources, and it was not demonstrated in a mobile or wearable form.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_autoencoder-based-articulatory-to-acoustic-mapping-for-ultrasound-silent-speech-interfaces-autoencoder-based-articulatory-to-acoustic-mapping-for-ultrasound-silent-speech-.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_autoencoder-based-articulatory-to-acoustic-mapping-for-ultrasound-silent-speech-interfaces-autoencoder-based-articulatory-to-acoustic-mapping-for-ultrasound-silent-speech-.txt",
          "section_or_location": "VI. CONCLUSIONS",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_denoising-convolutional-autoencoder-based-b-mode-ultrasound-tongue-image-feature-extraction",
      "slug": "denoising-convolutional-autoencoder-based-b-mode-ultrasound-tongue-image-feature-extraction",
      "title": "Denoising convolutional autoencoder based B-mode ultrasound tongue image feature extraction",
      "year": 2019,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Bo Li",
        "Kele Xu",
        "Dawei Feng",
        "Haibo Mi",
        "Huaimin Wang",
        "Jian Zhu"
      ],
      "url": "https://nao-ki-mura.com/paper/denoising-convolutional-autoencoder-based-b-mode-ultrasound-tongue-image-feature-extraction",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "1903.00888",
      "arxiv_url": "https://arxiv.org/abs/1903.00888",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium-high",
      "review_confidence": "medium-high",
      "review_basis": "full_text+structured_benchmark+summary",
      "source_coverage": "high",
      "tags": [
        "body_site:tongue",
        "deployment:hands-free",
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "evaluation:walking-tested",
        "modality:ultrasound",
        "output:text",
        "task:speech-reconstruction"
      ],
      "expert_take_short": "DCAE provides cleaner, more robust ultrasound tongue features leading to improved silent speech recognition, outperforming prior feature extraction strategies.",
      "expert_take_long": "This paper presents a feature representation study applying denoising convolutional autoencoders (DCAE) to ultrasound tongue image sequences in the context of silent speech interfaces. The authors evaluate DCAE against previous methods such as discrete cosine transform and conventional autoencoders using both reconstruction metrics (Mean Square Error and Complex Wavelet Structural Similarity) and speech recognition performance on the 2010 Silent Speech Challenge dataset. Results indicate DCAE provides improved robustness to noise and preserves spatial tongue structure better than both traditional and deep autoencoder approaches, leading to the lowest word error rates among evaluated methods. While the contribution is modest and focused on feature extraction rather than full SSI system architecture, it establishes a useful baseline for ultrasound SSI feature compression with evidence-backed performance gains. Limitations include evaluation on a single dataset with no cross-speaker generalization and reliance on ultrasound hardware, thus deployment readiness remains medium with scope for broader validation.",
      "expert_true_value": "A well-supported demonstration that denoising convolutional autoencoders improve feature representation quality for ultrasound tongue images in silent speech tasks, offering a stronger baseline for ultrasound SSI research.",
      "canon_before": "Ultrasound tongue feature extraction primarily used direct-image representations or hand-crafted basis decomposition methods such as PCA (EigenTongue) and DCT.",
      "delta_from_canon": "Introduces an unsupervised denoising convolutional autoencoder as a feature extractor compressing noisy ultrasound frames into latent representations for downstream recognition, replacing direct or handcrafted features.",
      "position_in_field": "Feature-extraction focused contribution advancing ultrasound-based silent speech interfaces.",
      "practical_value": "Provides a method yielding cleaner latent ultrasound features that can improve downstream speech recognition in ultrasound SSI pipelines.",
      "axes_moved": "system_design; evaluation",
      "axes_unresolved": "Generalization to broader cross-subject scenarios beyond the 2010 Silent Speech Challenge dataset",
      "axes_regressed": "",
      "technical_limits": "Single-speaker data; speckle noise and motion artifacts inherent in ultrasound; unknown cross-subject generalization.",
      "evaluation_limits": "Evaluated only on a single speaker-specific dataset (2010 silent speech challenge); no multi-speaker or cross-corpus validation provided.",
      "deployment_limits": "Need for specialized ultrasound imaging hardware; robustness to varying ultrasound systems and head movements not fully validated for real-world deployment.",
      "scope_limits": "Limited to ultrasound tongue image feature extraction; no multimodal fusion or speech synthesis explored.",
      "task": "speech-reconstruction",
      "input_modality": "ultrasound",
      "sensor_hardware": "Ultrasound imaging system with 4–8 MHz, 128-element microconvex probe",
      "body_site": "tongue",
      "output_type": "text",
      "vocabulary_type": "Not specified",
      "vocabulary_size": "Not specified",
      "metrics": "Mean Square Error (MSE), Complex Wavelet Structural Similarity Index (CW-SSIM), Word Error Rate (WER) of 6.17% (best with DCAE)",
      "evaluation_mode": "experimental study",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 0.95,
          "statement": "The denoising convolutional autoencoder (DCAE)-based method outperforms other feature extraction methods on reconstruction and silent speech recognition tasks.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_denoising-convolutional-autoencoder-based-b-mode-ultrasound-tongue-image-feature-extraction-denoising-convolutional-autoencoder-based-b-mode-ultrasound-tongue-image-feature.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_denoising-convolutional-autoencoder-based-b-mode-ultrasound-tongue-image-feature-extraction-denoising-convolutional-autoencoder-based-b-mode-ultrasound-tongue-image-feature.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.98,
          "statement": "DCAE achieves Word Error Rate (WER) of 6.17%, better than state-of-the-art 6.45% using DCT features on the 2010 Silent Speech Challenge corpus.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_denoising-convolutional-autoencoder-based-b-mode-ultrasound-tongue-image-feature-extraction-denoising-convolutional-autoencoder-based-b-mode-ultrasound-tongue-image-feature.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_denoising-convolutional-autoencoder-based-b-mode-ultrasound-tongue-image-feature-extraction-denoising-convolutional-autoencoder-based-b-mode-ultrasound-tongue-image-feature.txt",
          "section_or_location": "3.3. Silent speech challenge",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "fact",
          "confidence": 0.95,
          "statement": "Ultrasound tongue images are high-dimensional, suffer from low signal-to-noise ratio and speckle noise, and can be affected by head movement causing rotation or displacement.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_denoising-convolutional-autoencoder-based-b-mode-ultrasound-tongue-image-feature-extraction-denoising-convolutional-autoencoder-based-b-mode-ultrasound-tongue-image-feature.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_denoising-convolutional-autoencoder-based-b-mode-ultrasound-tongue-image-feature-extraction-denoising-convolutional-autoencoder-based-b-mode-ultrasound-tongue-image-feature.txt",
          "section_or_location": "1. INTRODUCTION",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "fact",
          "confidence": 0.9,
          "statement": "The dataset used is the 2010 Silent Speech Challenge dataset with mid-sagittal ultrasound tongue image sequences recorded at 60 fps using a 4-8 MHz, 128-element microconvex probe.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_denoising-convolutional-autoencoder-based-b-mode-ultrasound-tongue-image-feature-extraction-denoising-convolutional-autoencoder-based-b-mode-ultrasound-tongue-image-feature.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_denoising-convolutional-autoencoder-based-b-mode-ultrasound-tongue-image-feature-extraction-denoising-convolutional-autoencoder-based-b-mode-ultrasound-tongue-image-feature.txt",
          "section_or_location": "3.1. Dataset",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 0.9,
          "statement": "Evaluation is limited to a single-speaker dataset with no cross-speaker validation, raising questions about generalization.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_denoising-convolutional-autoencoder-based-b-mode-ultrasound-tongue-image-feature-extraction-denoising-convolutional-autoencoder-based-b-mode-ultrasound-tongue-image-feature.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_denoising-convolutional-autoencoder-based-b-mode-ultrasound-tongue-image-feature-extraction-denoising-convolutional-autoencoder-based-b-mode-ultrasound-tongue-image-feature.txt",
          "section_or_location": "4. CONCLUSION",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 0.9,
          "statement": "Application of denoising convolutional autoencoder for feature extraction on ultrasound tongue images to reduce noise and preserve spatial structure better than prior AE and DCT methods.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_denoising-convolutional-autoencoder-based-b-mode-ultrasound-tongue-image-feature-extraction-denoising-convolutional-autoencoder-based-b-mode-ultrasound-tongue-image-feature.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_denoising-convolutional-autoencoder-based-b-mode-ultrasound-tongue-image-feature-extraction-denoising-convolutional-autoencoder-based-b-mode-ultrasound-tongue-image-feature.txt",
          "section_or_location": "2.4. Denoising Convolutional Auto-encoder",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "deployment_claim",
          "confidence": 0.75,
          "statement": "The approach requires ultrasound imaging equipment and stabilization hardware, limiting deployment to systems where such hardware is available.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_denoising-convolutional-autoencoder-based-b-mode-ultrasound-tongue-image-feature-extraction-denoising-convolutional-autoencoder-based-b-mode-ultrasound-tongue-image-feature.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_denoising-convolutional-autoencoder-based-b-mode-ultrasound-tongue-image-feature-extraction-denoising-convolutional-autoencoder-based-b-mode-ultrasound-tongue-image-feature.txt",
          "section_or_location": "3.1. Dataset",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "fact",
          "confidence": 0.85,
          "statement": "CAEs and DCAEs outperform traditional deep autoencoders and DCT in reconstruction metrics (MSE and CW-SSIM), indicating better preservation of spatial structure in ultrasound images.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_denoising-convolutional-autoencoder-based-b-mode-ultrasound-tongue-image-feature-extraction-denoising-convolutional-autoencoder-based-b-mode-ultrasound-tongue-image-feature.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_denoising-convolutional-autoencoder-based-b-mode-ultrasound-tongue-image-feature-extraction-denoising-convolutional-autoencoder-based-b-mode-ultrasound-tongue-image-feature.txt",
          "section_or_location": "3.2. Reconstruction error comparison",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_all-neural-online-source-separation-counting-and-diarization-for-meeting-analysis",
      "slug": "all-neural-online-source-separation-counting-and-diarization-for-meeting-analysis",
      "title": "All-neural online source separation, counting, and diarization for meeting analysis",
      "year": 2019,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Thilo von Neumann",
        "Keisuke Kinoshita",
        "Marc Delcroix",
        "Shoko Araki",
        "Tomohiro Nakatani",
        "Reinhold Haeb-Umbach"
      ],
      "url": "https://nao-ki-mura.com/paper/all-neural-online-source-separation-counting-and-diarization-for-meeting-analysis",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "1902.07881",
      "arxiv_url": "https://arxiv.org/abs/1902.07881",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "modality:acoustic",
        "output:speech-audio"
      ],
      "expert_take_short": "Strong online diarization/separation paper, but outside SSI.",
      "expert_take_long": "This is not an SSI paper, but it is technically solid for online meeting analysis. The key idea is the block-online neural estimator that keeps speaker identity stable through silent blocks while adapting the number of output masks. The reported results are strong relative to online baselines: in the 12-block conversation-like setup, the gated proposed model reaches 11.7 dB SDR, 6.6% DER, and 4.9% SCER, with source counting above 98%. Keep it marked as adjacent diarization/separation, not silent speech.",
      "expert_true_value": "Tracking speakers through silent blocks in a single neural online system is the real contribution.",
      "canon_before": "Meeting diarization pipelines often stitched together separation and clustering and handled long silent gaps poorly in online mode.",
      "delta_from_canon": "The paper unifies separation, source counting, and diarization in one block-online neural estimator.",
      "position_in_field": "Relevant adjacent work for online separation/diarization, not for silent-speech interaction.",
      "practical_value": "Useful for long-form meeting transcription and diarization pipelines that cannot rely on offline clustering.",
      "axes_moved": "system_design; evaluation",
      "axes_unresolved": "The method is meeting-analysis infrastructure, not SSI, and still trails ideal masks by a wide margin.",
      "axes_regressed": "",
      "technical_limits": "Outside SSI, simulated meeting mixtures, and still well below the ideal-ratio-mask upper bound.",
      "evaluation_limits": "Meeting-analysis benchmarks only, with no SSI use case or human-interaction deployment.",
      "deployment_limits": "It targets single-channel meeting analysis rather than silent-speech interaction, and results are on simulated meeting mixtures.",
      "scope_limits": "Single-channel online meeting analysis with simulated mixtures.",
      "task": "online source separation and diarization",
      "input_modality": "single-channel meeting audio",
      "sensor_hardware": "",
      "body_site": "",
      "output_type": "speech-audio",
      "vocabulary_type": "",
      "vocabulary_size": "",
      "metrics": "In the 12-block conversation-like condition, proposed model (2) reports SDR 11.7 dB, DER 6.6%, and SCER 4.9%; source counting exceeds 98% and is above 99% in most other conditions.",
      "evaluation_mode": "SDR, DER, SCER, and source-counting accuracy across block-online meeting scenarios",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 0.94,
          "statement": "The abstract says the all-neural estimator performs block-online source separation, counting, and diarization while tracking speakers through silent blocks.",
          "evidence_source": "/Users/naoki/projects/nao-ki-mura/inputs/ssi_fulltext/text/ssi_all-neural-online-source-separation-counting-and-diarization-for-meeting-analysis-all-neural-online-source-separation-counting-and-diarization-for-meeting-analysi.txt",
          "source_ref": "/Users/naoki/projects/nao-ki-mura/inputs/ssi_fulltext/text/ssi_all-neural-online-source-separation-counting-and-diarization-for-meeting-analysis-all-neural-online-source-separation-counting-and-diarization-for-meeting-analysi.txt",
          "section_or_location": "ABSTRACT",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 0.9,
          "statement": "The introduction frames the method as an all-neural block-online alternative to two-stage meeting analysis pipelines that fail over long silent gaps.",
          "evidence_source": "/Users/naoki/projects/nao-ki-mura/inputs/ssi_fulltext/text/ssi_all-neural-online-source-separation-counting-and-diarization-for-meeting-analysis-all-neural-online-source-separation-counting-and-diarization-for-meeting-analysi.txt",
          "source_ref": "/Users/naoki/projects/nao-ki-mura/inputs/ssi_fulltext/text/ssi_all-neural-online-source-separation-counting-and-diarization-for-meeting-analysis-all-neural-online-source-separation-counting-and-diarization-for-meeting-analysi.txt",
          "section_or_location": "1. INTRODUCTION",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.96,
          "statement": "Table 1 shows the gated proposed model at 11.7 dB SDR, 6.6% DER, and 4.9% SCER in the 12-block conversation-like condition, with source counting above 98%.",
          "evidence_source": "/Users/naoki/projects/nao-ki-mura/inputs/ssi_fulltext/text/ssi_all-neural-online-source-separation-counting-and-diarization-for-meeting-analysis-all-neural-online-source-separation-counting-and-diarization-for-meeting-analysi.txt",
          "source_ref": "/Users/naoki/projects/nao-ki-mura/inputs/ssi_fulltext/text/ssi_all-neural-online-source-separation-counting-and-diarization-for-meeting-analysis-all-neural-online-source-separation-counting-and-diarization-for-meeting-analysi.txt",
          "section_or_location": "3.4. Results",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 0.89,
          "statement": "The conclusion scopes the contribution to meeting analysis and notes comparison to realistic online baselines rather than SSI tasks.",
          "evidence_source": "/Users/naoki/projects/nao-ki-mura/inputs/ssi_fulltext/text/ssi_all-neural-online-source-separation-counting-and-diarization-for-meeting-analysis-all-neural-online-source-separation-counting-and-diarization-for-meeting-analysi.txt",
          "source_ref": "/Users/naoki/projects/nao-ki-mura/inputs/ssi_fulltext/text/ssi_all-neural-online-source-separation-counting-and-diarization-for-meeting-analysis-all-neural-online-source-separation-counting-and-diarization-for-meeting-analysi.txt",
          "section_or_location": "5. CONCLUSIONS",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "kimura2019_sottovoce",
      "slug": "sottovoce",
      "title": "SottoVoce: An Ultrasound Imaging-Based Silent Speech Interaction Using Deep Neural Networks",
      "year": 2019,
      "venue": "CHI '19",
      "authors": [
        "Naoki Kimura",
        "Michinari Kono",
        "Jun Rekimoto"
      ],
      "url": "https://nao-ki-mura.com/paper/sottovoce",
      "doi": "10.1145/3290605.3300376",
      "doi_url": "https://doi.org/10.1145/3290605.3300376",
      "arxiv_id": "",
      "arxiv_url": "",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text+existing_expert_seed",
      "source_coverage": "high",
      "tags": [
        "body_site:jaw",
        "body_site:oral-cavity",
        "deployment:hands-free",
        "deployment:speaker-dependent",
        "evaluation:quantitative",
        "modality:ultrasound",
        "output:speech-audio",
        "task:speech-reconstruction"
      ],
      "expert_take_short": "A solid proof of concept that reconstructs speech audio from ultrasound for controlling unmodified smart speakers, showcasing important system design insight despite prototype limitations in latency, hardware bulk, and speaker dependency.",
      "expert_take_long": "Kimura et al. present a well-engineered proof of concept for silent speech interaction using ultrasonic imaging and deep neural networks. Their primary contribution is a two-stage neural pipeline converting ultrasonic images captured below the jaw into Mel-spectrogram features and refining those into audio signals, which can then control unmodified commercial smart speakers like Amazon Alexa. This architectural decision to reconstruct audio, rather than directly classify commands, is a significant reframing within SSI research. The study provides modest yet concrete quantitative results: a 65% command recognition success rate with the two-network pipeline, 33.56% word error rate on Google STT, and demonstration of system use with limited command sets. They explicitly discuss limitations such as speaker dependence, system latency (~2.61 s), bulky hardware, and user adaptation requirements for silent speech without vocal fold vibration. These aspects clarify that while the prototype is not ready for real-world deployment or continuous real-time interaction, it is a strong conceptual and technical foundation for future research on SSI architectures that reuse existing voice ecosystems via speech regeneration.",
      "expert_true_value": "This paper's key value lies in reframing silent speech interaction as a speech regeneration and ecosystem reuse problem, deploying a two-stage DNN approach to produce audio from ultrasonic tongue and jaw imaging. It demonstrates practical integration with existing voice agents, highlighting a promising architectural direction distinct from direct command recognition methods.",
      "canon_before": "Most prior silent speech interfaces recognized commands directly or relied on visible cameras or other sensors, lacking integration with unchanged smart speaker ecosystems.",
      "delta_from_canon": "This work shifts SSI from direct command recognition to speech audio regeneration that can be fed to standard speech recognition engines and smart speakers without modification.",
      "position_in_field": "An early and influential demonstration of ultrasound-based speech regeneration SSI with system-level insights stronger than its present prototype performance.",
      "practical_value": "medium as a systems design contribution and research direction; low-to-medium as a direct deployable interface in reported form",
      "axes_moved": "ecosystem_integration; speech_regeneration; smart_speaker_control; user_adaptation_loop",
      "axes_unresolved": "real_time_interaction; open_vocabulary; speaker_independence; wearable_ultrasound",
      "axes_regressed": "",
      "technical_limits": "Speaker-dependent training; latency unsuitable for real-time use (2.61 s per utterance); differences in silent versus voiced articulation require user adaptation; bulky hardware; potential unknown safety issues with continuous ultrasound emission; small vocabulary size.",
      "evaluation_limits": "Only two participants were used for training and testing; the command vocabulary is small (four Alexa commands) in end-to-end testing, repeated five times each; and no speaker-independent or open vocabulary evaluations were performed.",
      "deployment_limits": "The device requires a bulky 3.5 MHz convex probe attached under the jaw and digitized display capture; continuous ultrasonic emission safety is not evaluated; the system is not wearable or miniaturized; and its latency (~2.61 s per command) is too slow for real-time use.",
      "scope_limits": "Prototype supports only a fixed small command vocabulary in speaker-dependent training; no demonstration of open vocabulary or continuous real-time interaction.",
      "task": "speech-reconstruction",
      "input_modality": "ultrasound",
      "sensor_hardware": "3.5 MHz convex ultrasound probe attached under the jaw, with ultrasound images captured to display monitor and digitized video stored",
      "body_site": "jaw; oral-cavity",
      "output_type": "speech-audio",
      "vocabulary_type": "Command-level",
      "vocabulary_size": "Approximately 500 training spoken commands per collaborator; end-to-end evaluation on four specific Alexa commands repeated five times each",
      "metrics": "Network 1 alone achieved an average 42.5% smart speaker command recognition success; Network 1 plus Network 2 achieved 65.0%; ground-truth audio reached 90.0%. Google speech-to-text word error rates were 41.03% for Network 1 outputs and 33.56% for Network 2 outputs, versus 20.61% for ground truth audio.",
      "evaluation_mode": "Quantitative smart speaker success rates, word error rates with Google speech-to-text, and qualitative user adaptation observations.",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 1,
          "statement": "An ultrasound-only silent-voice system that regenerates audio and controls an unchanged smart speaker with deep neural networks.",
          "evidence_source": "inputs/ssi_fulltext/text/kimura2019_sottovoce-sottovoce-an-ultrasound-imaging-based-silent-speech-interaction-using-deep-neura.txt",
          "source_ref": "inputs/ssi_fulltext/text/kimura2019_sottovoce-sottovoce-an-ultrasound-imaging-based-silent-speech-interaction-using-deep-neura.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.95,
          "statement": "Network 1 plus Network 2 reached 65.0% smart-speaker command success versus 42.5% for Network 1 alone and 90.0% ground-truth audio; Google speech-to-text WER for Network 2 was 33.56%; total processing time was 2.61 s for a roughly 3.68 s command clip.",
          "evidence_source": "inputs/ssi_fulltext/text/kimura2019_sottovoce-sottovoce-an-ultrasound-imaging-based-silent-speech-interaction-using-deep-neura.txt",
          "source_ref": "inputs/ssi_fulltext/text/kimura2019_sottovoce-sottovoce-an-ultrasound-imaging-based-silent-speech-interaction-using-deep-neura.txt",
          "section_or_location": "4   RESULTS",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 0.98,
          "statement": "Approximately 500 speech commands were collected from each of two collaborators using an ultrasound probe under the jaw with paired audio, speaker-dependent training only, and end-to-end testing used four Alexa commands repeated five times each.",
          "evidence_source": "inputs/ssi_fulltext/text/kimura2019_sottovoce-sottovoce-an-ultrasound-imaging-based-silent-speech-interaction-using-deep-neura.txt",
          "source_ref": "inputs/ssi_fulltext/text/kimura2019_sottovoce-sottovoce-an-ultrasound-imaging-based-silent-speech-interaction-using-deep-neura.txt",
          "section_or_location": "4   RESULTS",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 0.97,
          "statement": "The system is speaker dependent, slow (2.61 seconds per command), limited to four end-to-end commands, uses bulky 3.5 MHz probe hardware under the jaw, and users must adapt their silent articulation for best results.",
          "evidence_source": "inputs/ssi_fulltext/text/kimura2019_sottovoce-sottovoce-an-ultrasound-imaging-based-silent-speech-interaction-using-deep-neura.txt",
          "source_ref": "inputs/ssi_fulltext/text/kimura2019_sottovoce-sottovoce-an-ultrasound-imaging-based-silent-speech-interaction-using-deep-neura.txt",
          "section_or_location": "5   END-TO-END EVALUATION",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "deployment_claim",
          "confidence": 0.95,
          "statement": "Generated audio signals can control existing smart speakers like Amazon Echo and Echo Show without modification, demonstrated with four Alexa commands.",
          "evidence_source": "inputs/ssi_fulltext/text/kimura2019_sottovoce-sottovoce-an-ultrasound-imaging-based-silent-speech-interaction-using-deep-neura.txt",
          "source_ref": "inputs/ssi_fulltext/text/kimura2019_sottovoce-sottovoce-an-ultrasound-imaging-based-silent-speech-interaction-using-deep-neura.txt",
          "section_or_location": "4   RESULTS",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 0.9,
          "statement": "The paper's main novelty lies in system design to reconstruct speech audio from ultrasound images and use that to control unmodified smart speakers, rather than direct command recognition from ultrasound features.",
          "evidence_source": "inputs/ssi_fulltext/text/kimura2019_sottovoce-sottovoce-an-ultrasound-imaging-based-silent-speech-interaction-using-deep-neura.txt",
          "source_ref": "inputs/ssi_fulltext/text/kimura2019_sottovoce-sottovoce-an-ultrasound-imaging-based-silent-speech-interaction-using-deep-neura.txt",
          "section_or_location": "3   SYSTEM ARCHITECTURE OF SOTTOVOCE",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "deployment_claim",
          "confidence": 0.9,
          "statement": "The system is not real-time; with 3.68 s of input, neural networks processing consumes 2.36 s and total processing time is 2.61 s, unsuitable for real-time interaction.",
          "evidence_source": "inputs/ssi_fulltext/text/kimura2019_sottovoce-sottovoce-an-ultrasound-imaging-based-silent-speech-interaction-using-deep-neura.txt",
          "source_ref": "inputs/ssi_fulltext/text/kimura2019_sottovoce-sottovoce-an-ultrasound-imaging-based-silent-speech-interaction-using-deep-neura.txt",
          "section_or_location": "3   SYSTEM ARCHITECTURE OF SOTTOVOCE",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_audio-spectrogram-factorization-for-classification-of-telephony-signals-below-the-auditory-threshold",
      "slug": "audio-spectrogram-factorization-for-classification-of-telephony-signals-below-the-auditory-threshold",
      "title": "Audio Spectrogram Factorization for Classification of Telephony Signals below the Auditory Threshold",
      "year": 2018,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Iroro Orife",
        "Shane Walker",
        "Jason Flaks"
      ],
      "url": "https://nao-ki-mura.com/paper/audio-spectrogram-factorization-for-classification-of-telephony-signals-below-the-auditory-threshold",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "1811.04139",
      "arxiv_url": "https://arxiv.org/abs/1811.04139",
      "review_state": "expert_fulltext_draft",
      "review_priority": "low",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "deployment:real-time",
        "evaluation:quantitative",
        "modality:acoustic",
        "output:labels",
        "task:audio-classification"
      ],
      "expert_take_short": "Strong telephony anti-SPAM paper, not SSI.",
      "expert_take_long": "The full text supports a real deployment story: the system uses only the first two seconds of a call, factorizes the spectrogram with SVD, and chooses a 100-tree Random Forest because precision matters more than recall in a call-routing business. Table 1 gives the key number set: 83.82% precision, 63.27% recall, and 90.40% accuracy. That is meaningful for screening dead-air robocalls at scale, but nothing in the paper recovers linguistic content or builds an SSI. It should stay in the archive only as a clearly labeled distractor or adjacent audio-classification reference.",
      "expert_true_value": "This is a solid production telephony SPAM classifier for sub-audible dead-air calls, not a silent speech interface paper.",
      "canon_before": "Telephony anti-SPAM systems typically relied on signaling, metadata, or higher-energy acoustic cues rather than sub-audible dead-air classification.",
      "delta_from_canon": "The paper reframes one robocall defense problem as low-amplitude spectrogram classification within a two-second production screening window.",
      "position_in_field": "Outside SSI scope; best understood as adjacent low-amplitude audio classification rather than speech-interface research.",
      "practical_value": "High for call-center anti-SPAM operations, low for SSI method development.",
      "axes_moved": "",
      "axes_unresolved": "",
      "axes_regressed": "",
      "technical_limits": "No speech content is decoded; the method only separates telephony SPAM from HAM and depends on this specific low-amplitude robocall regime.",
      "evaluation_limits": "The evidence is from a proprietary and class-imbalanced labeled set rather than a public benchmark with reproducible splits.",
      "deployment_limits": "Useful only inside comparable VoIP anti-SPAM systems with similar latency and business costs for false positives.",
      "scope_limits": "Outside SSI scope.",
      "task": "audio classification / telephony SPAM detection",
      "input_modality": "telephony call audio",
      "sensor_hardware": "VoIP / telephony call audio captured from the production call stack",
      "body_site": "",
      "output_type": "labels",
      "vocabulary_type": "",
      "vocabulary_size": "",
      "metrics": "Table 1 reports Random Forest precision 83.82%, recall 63.27%, accuracy 90.40%, outperforming the reported linear SVC baselines. The paper also notes 10,000 to 33,000 silent calls per day during traffic-pumping attacks and a strict two-second screening window.",
      "evaluation_mode": "Cross-validation on labeled telephony SPAM / HAM calls with a deployment-minded two-second latency budget before call bridging.",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 1,
          "statement": "The paper proposes classifying dead-air or silent SPAM calls from features derived by factorizing the caller audio spectrogram.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_audio-spectrogram-factorization-for-classification-of-telephony-signals-below-the-auditory-threshold-audio-spectrogram-factorization-for-classification-of-telephony-signals-below-th.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_audio-spectrogram-factorization-for-classification-of-telephony-signals-below-the-auditory-threshold-audio-spectrogram-factorization-for-classification-of-telephony-signals-below-th.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "fact",
          "confidence": 1,
          "statement": "The paper's main contributions include using SVD-derived audio spectrogram features and classifying SPAM from only the first two seconds of a call at production scale.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_audio-spectrogram-factorization-for-classification-of-telephony-signals-below-the-auditory-threshold-audio-spectrogram-factorization-for-classification-of-telephony-signals-below-th.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_audio-spectrogram-factorization-for-classification-of-telephony-signals-below-the-auditory-threshold-audio-spectrogram-factorization-for-classification-of-telephony-signals-below-th.txt",
          "section_or_location": "1 Introduction",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "Table 1 reports Random Forest precision 83.82%, recall 63.27%, and accuracy 90.40%, better matching the business preference for high precision than the linear SVC baselines.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_audio-spectrogram-factorization-for-classification-of-telephony-signals-below-the-auditory-threshold-audio-spectrogram-factorization-for-classification-of-telephony-signals-below-th.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_audio-spectrogram-factorization-for-classification-of-telephony-signals-below-the-auditory-threshold-audio-spectrogram-factorization-for-classification-of-telephony-signals-below-th.txt",
          "section_or_location": "Table 1. Classifier performance",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 1,
          "statement": "The experiments are explicitly constrained by a production requirement that two seconds is the maximum permissible latency before bridging calls, and the attack regime can add 10,000 to 33,000 silent calls per day.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_audio-spectrogram-factorization-for-classification-of-telephony-signals-below-the-auditory-threshold-audio-spectrogram-factorization-for-classification-of-telephony-signals-below-th.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_audio-spectrogram-factorization-for-classification-of-telephony-signals-below-the-auditory-threshold-audio-spectrogram-factorization-for-classification-of-telephony-signals-below-th.txt",
          "section_or_location": "5 Experiments",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_proactive-security-embedded-ai-solution-for-violent-and-abusive-speech-recognition",
      "slug": "proactive-security-embedded-ai-solution-for-violent-and-abusive-speech-recognition",
      "title": "Proactive Security: Embedded AI Solution for Violent and Abusive Speech Recognition",
      "year": 2018,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Christoph Dane",
        "Shulbi Bixbi Solutions",
        "Leonardo Pombal",
        "Samsung Research",
        "Vitor Jord˜ao",
        "Shulbi Bixbi Solutions",
        "Guilherme Zioll",
        "Samsung Research",
        "Bruno Martho",
        "Shulbi Bixbi Solutions",
        "Antˆonio Postal",
        "Samsung Research",
        "Thiago Prochnow",
        "Samsung Research"
      ],
      "url": "https://nao-ki-mura.com/paper/proactive-security-embedded-ai-solution-for-violent-and-abusive-speech-recognition",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "1810.09431",
      "arxiv_url": "https://arxiv.org/abs/1810.09431",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "medium",
      "review_basis": "full_text+structured_benchmark+summary",
      "source_coverage": "high",
      "tags": [
        "modality:microphone",
        "evaluation:structured-benchmark",
        "output:text",
        "task:speech-recognition"
      ],
      "expert_take_short": "An embedded smartphone NLP classifier detects violent speech with ~87.5% accuracy using known methods but is unrelated to silent speech interfaces; strong practical application in safety alerting.",
      "expert_take_long": "This paper addresses embedded detection of violent and abusive speech from smartphone audio using NLP-based classifiers. It employs bag-of-words and word embeddings with SVMs, augmented by SMOTE for class imbalance, on a curated 1200-sentence corpus in Brazilian Portuguese. The system runs on Android, using the native speech recognition API to convert audio to text before classification. Results show improved F1 and accuracy with embeddings (87.5%). While practical and novel as a silent alerting application, the approach is not silent speech interface technology and remains limited by dataset size, OS constraints (microphone access and persistent service), battery consumption, and language scope. Overall, this system is a focused contribution in safety monitoring via embedded NLP rather than silent speech recognition, and shows promising deployment feasibility with typical technical and dataset limitations of a PoC stage embedded classifier.",
      "expert_true_value": "Demonstration of embedded violent speech detection on smartphone with small model footprint and improved NLP classification techniques, enabling proactive silent alerts in real-time applications.",
      "canon_before": "Violent-speech detection was typically keyword-based or reliant on panic buttons requiring user action.",
      "delta_from_canon": "Replaces keyword or panic button triggers with NLP-driven phrase classification using SVM on bag-of-words and word embeddings and augments minority class with SMOTE for balance.",
      "position_in_field": "A safety-oriented embedded speech classification system outside silent speech interfaces.",
      "practical_value": "Useful embedded product concept for violent speech detection and silent alerting on smartphones, but unrelated to silent speech recognition technology.",
      "axes_moved": "system_design; evaluation",
      "axes_unresolved": "Generalization beyond 1200-sentence corpus",
      "axes_regressed": "",
      "technical_limits": "Relies on Android SpeechRecognizer API which complicates continuous listening due to enforced beeps and OS kill; limited effectivity from small dataset and restricted language.",
      "evaluation_limits": "Evaluation limited to small 1200-phrase dataset split randomly with 70% training and 30% validation; performance metrics given only on internal validation set; no external or real-world user study reported.",
      "deployment_limits": "Relies on Android SpeechRecognizer which requires restarting, consumes significant battery (~15h continuous use), and is not fully silent due to periodic beeps; no silent or articulatory input methods; limited to Brazilian Portuguese.",
      "scope_limits": "Limited to Brazilian Portuguese violent and abusive speech phrase detection; no general speech recognition or silent speech function.",
      "task": "speech-recognition",
      "input_modality": "microphone",
      "sensor_hardware": "smartphone microphone",
      "body_site": "",
      "output_type": "text",
      "vocabulary_type": "Mixed; includes both violent and non-violent phrases; binary features for bag-of-words; numeric vector embeddings for word2vec",
      "vocabulary_size": "Not specified; vocabulary reduced by stemming and stop-word removal but no explicit size given.",
      "metrics": "Validation accuracy 79% (bag-of-words) and 87.5% (word embeddings); F1 score of 0.78 and 0.87 respectively; confusion matrices show false positive rates reduced from 26% to 6% with embeddings.",
      "evaluation_mode": "Proof-of-concept product evaluation with offline corpus split for training and validation.",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 0.95,
          "statement": "We propose an embedded artificial intelligence solution, using natural language and speech processing technology, to silently alert someone who can help in a violent situation.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_proactive-security-embedded-ai-solution-for-violent-and-abusive-speech-recognition-proactive-security-embedded-ai-solution-for-violent-and-abusive-speech-recogniti.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_proactive-security-embedded-ai-solution-for-violent-and-abusive-speech-recognition-proactive-security-embedded-ai-solution-for-violent-and-abusive-speech-recogniti.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 0.95,
          "statement": "Dataset consists of 1200 sentences with 400 positive (violent) and 800 negative (non-violent) phrases, collected from public police occurrences, threat emails, investigation data, scientific articles, localized to Brazilian Portuguese.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_proactive-security-embedded-ai-solution-for-violent-and-abusive-speech-recognition-proactive-security-embedded-ai-solution-for-violent-and-abusive-speech-recogniti.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_proactive-security-embedded-ai-solution-for-violent-and-abusive-speech-recognition-proactive-security-embedded-ai-solution-for-violent-and-abusive-speech-recogniti.txt",
          "section_or_location": "III. PROPOSED SOLUTION",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.95,
          "statement": "Bag-of-words + SVM achieved 79% validation accuracy with 0.78 F1 score; confusion matrix true positive rate 73%, false positive rate 26%, true negative rate 86%, false negative rate 14%.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_proactive-security-embedded-ai-solution-for-violent-and-abusive-speech-recognition-proactive-security-embedded-ai-solution-for-violent-and-abusive-speech-recogniti.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_proactive-security-embedded-ai-solution-for-violent-and-abusive-speech-recognition-proactive-security-embedded-ai-solution-for-violent-and-abusive-speech-recogniti.txt",
          "section_or_location": "IV. RESULTS",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.95,
          "statement": "Word embeddings + SVM + SMOTE achieved 87.5% validation accuracy with 0.87 F1 score; confusion matrix true positive rate 94%, false positive rate 6%, true negative rate 78%, false negative rate 22%.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_proactive-security-embedded-ai-solution-for-violent-and-abusive-speech-recognition-proactive-security-embedded-ai-solution-for-violent-and-abusive-speech-recogniti.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_proactive-security-embedded-ai-solution-for-violent-and-abusive-speech-recognition-proactive-security-embedded-ai-solution-for-violent-and-abusive-speech-recogniti.txt",
          "section_or_location": "IV. RESULTS",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "deployment_claim",
          "confidence": 0.9,
          "statement": "Implemented as an Android application using native SpeechRecognizer for continuous microphone input transcription, running silently in background with under 10MB model footprint; sends SMS/email with location and pictures on detecting violent speech.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_proactive-security-embedded-ai-solution-for-violent-and-abusive-speech-recognition-proactive-security-embedded-ai-solution-for-violent-and-abusive-speech-recogniti.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_proactive-security-embedded-ai-solution-for-violent-and-abusive-speech-recognition-proactive-security-embedded-ai-solution-for-violent-and-abusive-speech-recogniti.txt",
          "section_or_location": "III. PROPOSED SOLUTION",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 0.9,
          "statement": "Technical limits include dependency on the Android SpeechRecognizer API, which produces audible beeps when listening starts; the OS forcibly killing the service, which requires restarts and causes battery drain (~15 hours under continuous use); and reliance on a relatively small and narrowly scoped dataset.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_proactive-security-embedded-ai-solution-for-violent-and-abusive-speech-recognition-proactive-security-embedded-ai-solution-for-violent-and-abusive-speech-recogniti.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_proactive-security-embedded-ai-solution-for-violent-and-abusive-speech-recognition-proactive-security-embedded-ai-solution-for-violent-and-abusive-speech-recogniti.txt",
          "section_or_location": "VI. FUTURE WORK",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 0.85,
          "statement": "The system currently works only for Brazilian Portuguese and requires expansion to other languages and larger datasets for generalization and improved accuracy.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_proactive-security-embedded-ai-solution-for-violent-and-abusive-speech-recognition-proactive-security-embedded-ai-solution-for-violent-and-abusive-speech-recogniti.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_proactive-security-embedded-ai-solution-for-violent-and-abusive-speech-recognition-proactive-security-embedded-ai-solution-for-violent-and-abusive-speech-recogniti.txt",
          "section_or_location": "VI. FUTURE WORK",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 0.9,
          "statement": "The main novelty is embedding the abusive speech classification model with a small footprint (<10MB) on a smartphone, allowing proactive silent alerting without user action, moving beyond prior keyword or panic button systems.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_proactive-security-embedded-ai-solution-for-violent-and-abusive-speech-recognition-proactive-security-embedded-ai-solution-for-violent-and-abusive-speech-recogniti.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_proactive-security-embedded-ai-solution-for-violent-and-abusive-speech-recognition-proactive-security-embedded-ai-solution-for-violent-and-abusive-speech-recogniti.txt",
          "section_or_location": "III. PROPOSED SOLUTION",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_harnessing-ai-for-speech-reconstruction-using-multi-view-silent-video-feed",
      "slug": "harnessing-ai-for-speech-reconstruction-using-multi-view-silent-video-feed",
      "title": "Harnessing AI for Speech Reconstruction using Multi-view Silent Video Feed",
      "year": 2018,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Yaman Kumar",
        "Mayank Aggarwal",
        "Pratham Nawal",
        "Shin’ichi Satoh",
        "Rajiv Ratn Shah",
        "Roger Zimmermann"
      ],
      "url": "https://nao-ki-mura.com/paper/harnessing-ai-for-speech-reconstruction-using-multi-view-silent-video-feed",
      "doi": "10.1145/3240508.3241911",
      "doi_url": "https://doi.org/10.1145/3240508.3241911",
      "arxiv_id": "1807.00619",
      "arxiv_url": "https://arxiv.org/abs/1807.00619",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium-high",
      "review_confidence": "medium-high",
      "review_basis": "full_text+structured_benchmark+summary",
      "source_coverage": "high",
      "tags": [
        "body_site:face",
        "body_site:lip",
        "body_site:tongue",
        "modality:camera",
        "modality:video",
        "output:speech-audio",
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "task:speech-reconstruction"
      ],
      "expert_take_short": "Multi-view silent video combined with CNN-LSTM models significantly improves speech audio reconstruction quality over single-view, highlighting the importance of optimal camera placement to address pose variance.",
      "expert_take_long": "This paper presents a pioneering system integrating multi-view silent video inputs for direct speech audio reconstruction using CNN-LSTM neural networks. It advances beyond prior single-view lipreading or text-based methods by generating synchronized audio, addressing pose variation with multi-angle capture. Evaluated on the OuluVS2 dataset, the method shows notable perceptual quality improvements when combining camera views spaced around 30° to 60°, validating the importance of camera placement for robustness. However, the approach remains constrained to controlled laboratory conditions, requiring multiple cameras, speaker-dependent training, and stable lighting, posing challenges for real-world deployment and generalization. The system has practical relevance for security, assistive technologies, video conferencing, and multimedia enhancement, with future work needed to expand vocabulary coverage and robustness in unconstrained environments.",
      "expert_true_value": "Introduces multi-camera synchronized video input to reconstruct intelligible and synchronized speech audio directly, overcoming pose limitations in prior lipreading works that generated text only.",
      "canon_before": "Prior lipreading and speech reconstruction works were mostly single-view and generated text transcripts rather than synchronized audio.",
      "delta_from_canon": "Introduces multi-view video input instead of single view and reconstructs synchronized speech audio directly rather than text, with consideration of camera placement to handle pose variability.",
      "position_in_field": "Early work demonstrating multi-view silent-video based audio speech reconstruction using deep learning.",
      "practical_value": "Offers practical improvements for applications needing synchronized speech audio from silent video, including assistive tech, security, and multimedia, but requires further robustness and deployment engineering.",
      "axes_moved": "system_design; evaluation",
      "axes_unresolved": "Generalization beyond controlled dataset (OuluVS2) unknown.",
      "axes_regressed": "",
      "technical_limits": "Requires controlled lighting, multiple camera views, speaker-dependent training; limited robustness to real-world variability.",
      "evaluation_limits": "Evaluation limited to controlled dataset (OuluVS2) with limited speakers and sessions; does not test unseen vocabulary or natural noisy environments.",
      "deployment_limits": "Requires multiple cameras placed optimally (30° to 60° apart), controlled lighting and speaker-dependent training; large-scale or real-world deployment constrained by hardware and environment challenges.",
      "scope_limits": "Focus restricted to multi-view silent-video speech reconstruction using deep learning on controlled datasets.",
      "task": "speech-reconstruction",
      "input_modality": "multi-view silent video feed",
      "sensor_hardware": "Multiple cameras recording video from 5 different angles (0°, 30°, 45°, 60°, 90°).",
      "body_site": "face; lip; tongue",
      "output_type": "speech-audio",
      "vocabulary_type": "Common English phrases and digits; no mention of open vocabulary.",
      "vocabulary_size": "Limited (phrases, digits, TIMIT sentences from OuluVS2 dataset).",
      "metrics": "Perceptual Evaluation of Speech Quality (PESQ) scores used for quantitative audio quality assessment comparing original and reconstructed audio signals.",
      "evaluation_mode": "Experimental study on OuluVS2 with quantitative metric PESQ.",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 0.95,
          "statement": "This paper presents the world's first multi-view speech reading and reconstruction system generating intelligible speech audio from silent video feeds.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_harnessing-ai-for-speech-reconstruction-using-multi-view-silent-video-feed-harnessing-ai-for-speech-reconstruction-using-multi-view-silent-video-feed.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_harnessing-ai-for-speech-reconstruction-using-multi-view-silent-video-feed-harnessing-ai-for-speech-reconstruction-using-multi-view-silent-video-feed.txt",
          "section_or_location": "ABSTRACT",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 0.9,
          "statement": "The method uses a CNN-LSTM based model to learn mapping from silent multi-view video frames to speech audio represented with LPC features, enabling synchronized audio reconstruction.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_harnessing-ai-for-speech-reconstruction-using-multi-view-silent-video-feed-harnessing-ai-for-speech-reconstruction-using-multi-view-silent-video-feed.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_harnessing-ai-for-speech-reconstruction-using-multi-view-silent-video-feed-harnessing-ai-for-speech-reconstruction-using-multi-view-silent-video-feed.txt",
          "section_or_location": "3 METHODOLOGY",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 0.9,
          "statement": "Evaluations were conducted on the OuluVS2 dataset featuring 53 speakers recorded from 5 different angles; PESQ was used as metric to assess audio quality of reconstructed speech.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_harnessing-ai-for-speech-reconstruction-using-multi-view-silent-video-feed-harnessing-ai-for-speech-reconstruction-using-multi-view-silent-video-feed.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_harnessing-ai-for-speech-reconstruction-using-multi-view-silent-video-feed-harnessing-ai-for-speech-reconstruction-using-multi-view-silent-video-feed.txt",
          "section_or_location": "4 EVALUATION",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.95,
          "statement": "Perceptual Evaluation of Speech Quality (PESQ) scores showed that combining camera views around 30° to 60° yields up to ~26-27% improvement over single best view in reconstructed audio quality.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_harnessing-ai-for-speech-reconstruction-using-multi-view-silent-video-feed-harnessing-ai-for-speech-reconstruction-using-multi-view-silent-video-feed.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_harnessing-ai-for-speech-reconstruction-using-multi-view-silent-video-feed-harnessing-ai-for-speech-reconstruction-using-multi-view-silent-video-feed.txt",
          "section_or_location": "4 EVALUATION",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "deployment_claim",
          "confidence": 0.8,
          "statement": "Optimal camera placement around 30° to 60°, and availability of multiple cameras, are essential for best speech reconstruction results; deployment requires multiple cameras and controlled conditions.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_harnessing-ai-for-speech-reconstruction-using-multi-view-silent-video-feed-harnessing-ai-for-speech-reconstruction-using-multi-view-silent-video-feed.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_harnessing-ai-for-speech-reconstruction-using-multi-view-silent-video-feed-harnessing-ai-for-speech-reconstruction-using-multi-view-silent-video-feed.txt",
          "section_or_location": "4 EVALUATION",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 0.85,
          "statement": "Dataset is controlled with multiple views and different speakers but limited vocabulary; generalization beyond OuluVS2, speaker independence, and real-world robustness remain unaddressed.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_harnessing-ai-for-speech-reconstruction-using-multi-view-silent-video-feed-harnessing-ai-for-speech-reconstruction-using-multi-view-silent-video-feed.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_harnessing-ai-for-speech-reconstruction-using-multi-view-silent-video-feed-harnessing-ai-for-speech-reconstruction-using-multi-view-silent-video-feed.txt",
          "section_or_location": "6 CONCLUSIONS",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_visual-only-recognition-of-normal-whispered-and-silent-speech",
      "slug": "visual-only-recognition-of-normal-whispered-and-silent-speech",
      "title": "Visual-Only Recognition of Normal, Whispered and Silent Speech",
      "year": 2018,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Stavros Petridis",
        "Jie Shen",
        "Doruk Cetin",
        "Maja Pantic"
      ],
      "url": "https://nao-ki-mura.com/paper/visual-only-recognition-of-normal-whispered-and-silent-speech",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "1802.06399",
      "arxiv_url": "https://arxiv.org/abs/1802.06399",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:face",
        "body_site:lip",
        "deployment:speaker-independent",
        "evaluation:quantitative",
        "modality:video",
        "output:text",
        "task:speech-recognition"
      ],
      "expert_take_short": "Strong evidence that silent lipreading needs dedicated training.",
      "expert_take_long": "The value of the paper is empirical clarity. It records a reasonably sized three-mode database and shows that silent speech underperforms both normal and whispered speech even in matched conditions, while mismatched training makes things worse. In the digits experiment, silent matched performance is 62.2% and normal-trained testing on silent drops to 59.7%. In the phrases experiment, silent matched performance is 64.4% and normal-trained testing on silent drops to 61.2%. That makes the main lesson hard to ignore: silent visual speech should be treated as its own training regime rather than borrowed from vocalized lipreading.",
      "expert_true_value": "This is a core SSI result because the full text shows silent visual speech is a distinct regime, not just a slightly harder version of normal lipreading.",
      "canon_before": "Visual speech systems often assumed that vocalized data would transfer acceptably to whispered or silent speech.",
      "delta_from_canon": "The paper tests that assumption directly and shows silent speech is consistently worse and not well served by vocalized-only training.",
      "position_in_field": "Important visual-only SSI benchmark and transfer-analysis paper.",
      "practical_value": "High for anyone building camera-based silent speech systems or designing training data collection.",
      "axes_moved": "dataset coverage; evaluation; problem framing",
      "axes_unresolved": "open vocabulary; real-world capture; broader participant diversity",
      "axes_regressed": "",
      "technical_limits": "The task remains closed-vocabulary and laboratory-recorded, and the reported recognition rates are still far from practical open-ended silent communication.",
      "evaluation_limits": "Only digits and fixed phrases are evaluated; there is no open-vocabulary or in-the-wild test.",
      "deployment_limits": "No real deployment or live camera interface is shown.",
      "scope_limits": "Visual-only closed-vocabulary SSI recognition study.",
      "task": "speech-recognition",
      "input_modality": "visual speech video across normal, whispered, and silent modes",
      "sensor_hardware": "Three cameras capturing frontal, 45-degree, and profile views at 1280x780 and 30 fps",
      "body_site": "face / lips",
      "output_type": "text",
      "vocabulary_type": "digits and short phrases",
      "vocabulary_size": "10 digits and 10 fixed short phrases",
      "metrics": "Digits matched-condition classification rates are 68.0% for normal, 70.5% for whispered, and 62.2% for silent speech. Phrases matched-condition rates are 69.7%, 70.8%, and 64.4%, while training on normal and testing on silent drops to 59.7% for digits and 61.2% for phrases.",
      "evaluation_mode": "Repeated subject-independent train / validation / test experiments for digits and phrases, with matched and mismatched train-test speech modes.",
      "evidence": [
        {
          "claim_type": "validation_scope",
          "confidence": 1,
          "statement": "The database contains 53 participants for digits and 39 participants for phrases, recorded in normal, whispered, and silent speech with frontal, 45-degree, and profile cameras.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_visual-only-recognition-of-normal-whispered-and-silent-speech-visual-only-recognition-of-normal-whispered-and-silent-speech.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_visual-only-recognition-of-normal-whispered-and-silent-speech-visual-only-recognition-of-normal-whispered-and-silent-speech.txt",
          "section_or_location": "2. DATABASE DESCRIPTION",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "In the digits experiment, Table 1 reports matched-condition classification rates of 68.0% for normal, 70.5% for whispered, and 62.2% for silent speech, while normal-trained testing on silent drops to 59.7%.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_visual-only-recognition-of-normal-whispered-and-silent-speech-visual-only-recognition-of-normal-whispered-and-silent-speech.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_visual-only-recognition-of-normal-whispered-and-silent-speech-visual-only-recognition-of-normal-whispered-and-silent-speech.txt",
          "section_or_location": "Table 1. Mean classification rate (and standard deviation) for the digits experiment.",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "In the phrases experiment, Table 2 reports matched-condition classification rates of 69.7% for normal, 70.8% for whispered, and 64.4% for silent speech, while normal-trained testing on silent drops to 61.2%.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_visual-only-recognition-of-normal-whispered-and-silent-speech-visual-only-recognition-of-normal-whispered-and-silent-speech.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_visual-only-recognition-of-normal-whispered-and-silent-speech-visual-only-recognition-of-normal-whispered-and-silent-speech.txt",
          "section_or_location": "Table 2. Mean classification rate (and standard deviation) for the phrases experiment.",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 1,
          "statement": "The conclusion states that using vocalized training data for a silent speech recognition system is not the best approach because performance on silent speech suffers the most.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_visual-only-recognition-of-normal-whispered-and-silent-speech-visual-only-recognition-of-normal-whispered-and-silent-speech.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_visual-only-recognition-of-normal-whispered-and-silent-speech-visual-only-recognition-of-normal-whispered-and-silent-speech.txt",
          "section_or_location": "6. CONCLUSION",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_cross-modal-embeddings-for-video-and-audio-retrieval",
      "slug": "cross-modal-embeddings-for-video-and-audio-retrieval",
      "title": "Cross-modal Embeddings for Video and Audio Retrieval",
      "year": 2018,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Didac Suris",
        "Amanda Duarte",
        "Amaia Salvador",
        "Jordi Torres",
        "Xavier Giro-i-Nieto"
      ],
      "url": "https://nao-ki-mura.com/paper/cross-modal-embeddings-for-video-and-audio-retrieval",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "1801.02200",
      "arxiv_url": "https://arxiv.org/abs/1801.02200",
      "review_state": "expert_fulltext_draft",
      "review_priority": "low",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "evaluation:quantitative",
        "modality:acoustic",
        "modality:multimodal",
        "modality:video"
      ],
      "expert_take_short": "Useful multimodal retrieval baseline, not SSI.",
      "expert_take_long": "The full text supports a narrow but clear claim. The method learns a shared embedding for precomputed YouTube-8M audio and video features and evaluates retrieval in both directions. The best reported setting reaches roughly 22% Recall@1 and about 52% Recall@5 for 256 candidates, then degrades as the gallery grows. That makes this a reasonable lightweight retrieval paper, but the authors explicitly say they do not address exact cross-modal alignment, and the task is retrieval rather than silent speech recognition or reconstruction. It belongs in the archive only as an adjacent multimodal distractor.",
      "expert_true_value": "The paper is a competent audiovisual retrieval result, but it is not an SSI paper and does not decode or synthesize speech for communication.",
      "canon_before": "Cross-modal retrieval usually paired images with text or relied on narrower audiovisual domains such as music videos.",
      "delta_from_canon": "This paper uses synchronized web video to learn a lightweight joint embedding for retrieving audio from video and video from audio at YouTube-8M scale.",
      "position_in_field": "Outside SSI scope; relevant only as an adjacent multimodal representation-learning paper.",
      "practical_value": "Moderate for general audiovisual retrieval, limited for SSI.",
      "axes_moved": "",
      "axes_unresolved": "",
      "axes_regressed": "",
      "technical_limits": "The model does not solve fine temporal alignment and ignores much of the temporal structure available in the underlying videos.",
      "evaluation_limits": "Evaluation uses a 6,000-clip subset with precomputed features rather than end-to-end raw audiovisual processing or a speech-oriented benchmark.",
      "deployment_limits": "The work supports offline retrieval, not an interactive SSI system.",
      "scope_limits": "Outside SSI scope.",
      "task": "cross-modal retrieval",
      "input_modality": "video and audio",
      "sensor_hardware": "YouTube-8M precomputed audio windows and visual features sampled at 1 Hz",
      "body_site": "",
      "output_type": "retrieval ranking",
      "vocabulary_type": "",
      "vocabulary_size": "",
      "metrics": "For 256 candidates, Table 1 reports audio-to-video Recall@1/5/10 of 21.5/52.0/63.1 and Table 2 reports video-to-audio Recall@1/5/10 of 22.3/51.7/64.4. Performance drops to about 10% Recall@1 once the candidate pool grows to 1024 items.",
      "evaluation_mode": "Audio-to-video and video-to-audio retrieval on a 6,000-clip YouTube-8M subset using Recall@1, Recall@5, and Recall@10 at multiple candidate-pool sizes.",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 1,
          "statement": "The paper learns a joint audio-visual embedding space so that audio can retrieve matching videos and video can retrieve matching audio.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_cross-modal-embeddings-for-video-and-audio-retrieval-cross-modal-embeddings-for-video-and-audio-retrieval.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_cross-modal-embeddings-for-video-and-audio-retrieval-cross-modal-embeddings-for-video-and-audio-retrieval.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "For 256 candidates, Table 1 reports audio-to-video Recall@1 21.5%, Recall@5 52.0%, and Recall@10 63.1%.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_cross-modal-embeddings-for-video-and-audio-retrieval-cross-modal-embeddings-for-video-and-audio-retrieval.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_cross-modal-embeddings-for-video-and-audio-retrieval-cross-modal-embeddings-for-video-and-audio-retrieval.txt",
          "section_or_location": "Table 1. Evaluation of Recall from audio to video",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "For 256 candidates, Table 2 reports video-to-audio Recall@1 22.3%, Recall@5 51.7%, and Recall@10 64.4%, with lower scores as the gallery grows to 1024 items.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_cross-modal-embeddings-for-video-and-audio-retrieval-cross-modal-embeddings-for-video-and-audio-retrieval.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_cross-modal-embeddings-for-video-and-audio-retrieval-cross-modal-embeddings-for-video-and-audio-retrieval.txt",
          "section_or_location": "Table 2. Evaluation of Recall from video to audio",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 1,
          "statement": "The authors explicitly state that they do not address exact alignment between the two modalities because doing so would require much higher computational effort.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_cross-modal-embeddings-for-video-and-audio-retrieval-cross-modal-embeddings-for-video-and-audio-retrieval.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_cross-modal-embeddings-for-video-and-audio-retrieval-cross-modal-embeddings-for-video-and-audio-retrieval.txt",
          "section_or_location": "1. INTRODUCTION",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_lip2audspec-speech-reconstruction-from-silent-lip-movements-video",
      "slug": "lip2audspec-speech-reconstruction-from-silent-lip-movements-video",
      "title": "Lip2AudSpec: Speech reconstruction from silent lip movements video",
      "year": 2017,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Hassan Akbari",
        "Himani Arora",
        "Liangliang Cao",
        "Nima Mesgarani"
      ],
      "url": "https://nao-ki-mura.com/paper/lip2audspec-speech-reconstruction-from-silent-lip-movements-video",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "1710.09798",
      "arxiv_url": "https://arxiv.org/abs/1710.09798",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:lip",
        "deployment:hands-free",
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "modality:video",
        "output:speech-audio",
        "task:speech-reconstruction"
      ],
      "expert_take_short": "The paper's auditory spectrogram autoencoder bottleneck target is a key innovation that produces more intelligible, natural reconstructed speech from lip videos than prior methods, as confirmed by objective and human evaluations.",
      "expert_take_long": "This paper meaningfully advances lip-to-speech reconstruction by combining a robust auditory spectrogram audio representation compressed by a noise-robust deep autoencoder with a video-driven CNN-LSTM model predicting bottleneck features to reconstruct natural-sounding speech. Experiments on the GRID dataset with 4 speakers show significant gains over the prior Vid2Speech baseline: average spectro-temporal modulation index (STMI) improved from 0.52 to 0.80, PESQ from 1.76 to 1.88, and Corr2D from 0.61 to 0.88, evidencing more accurate acoustic reconstructions. Additionally, a Mechanical Turk human transcription evaluation found a 5% absolute word accuracy improvement (51% to 56%) and a striking correct speaker gender classification increase (43% to 85%), demonstrating better preservation of pitch and speaker traits. Nonetheless, the approach remains limited to a small closed vocabulary and speaker pool. The method depends solely on lip video, thus missing articulatory cues from tongue or throat that particularly affect vowel and high-frequency speech reconstruction. Despite these limitations, the paper sets a valuable benchmark focusing on the importance of speech representation and an audio-visual pipeline design for better intelligibility in lip-based speech reconstruction.",
      "expert_true_value": "Demonstrates that leveraging a deep compressed auditory spectrogram representation as a reconstruction target materially improves intelligibility and pitch preservation in lip-to-speech systems over prior spectrogram or LPC-based methods on a well-known benchmark.",
      "canon_before": "Prior lip-to-speech systems like Vid2Speech reconstructed speech but suffered weak pitch and quality due to target representations missing excitation parameters.",
      "delta_from_canon": "Shifts from using classical LPC or spectrogram targets to auditory spectrogram compressed by a deep autoencoder for better speaker and pitch preservation; evaluates with human transcription and objective metrics.",
      "position_in_field": "Strong lip-to-speech reconstruction baseline focused on improved acoustic targets rather than solely on visual feature encoders.",
      "practical_value": "Useful as a reference lip-to-speech system that highlights the impact of appropriate speech representation on intelligibility and speaker characteristics preservation.",
      "axes_moved": "system_design; evaluation",
      "axes_unresolved": "open-vocabulary reconstruction, speaker independence beyond GRID, richer articulation features beyond lips",
      "axes_regressed": "",
      "technical_limits": "Lip-only video input misses tongue and throat information affecting some vowels and high-frequency speech; only evaluated on limited closed vocabulary and speaker set.",
      "evaluation_limits": "Evaluations are on the 4-speaker GRID corpus only; human intelligibility measured via Mechanical Turk limited to vocabulary and speakers in dataset.",
      "deployment_limits": "Trained and evaluated strictly on 4 GRID speakers with controlled vocabulary; lacks real-world recordings and speaker variability; lip-only input misses tongue/throat cues affecting vowel/high-frequency fidelity.",
      "scope_limits": "Closed vocabulary lip-to-speech reconstruction on GRID corpus videos of 4 speakers only.",
      "task": "speech-reconstruction",
      "input_modality": "silent lip video",
      "sensor_hardware": "camera",
      "body_site": "lip",
      "output_type": "speech-audio",
      "vocabulary_type": "closed vocabulary",
      "vocabulary_size": "51 words",
      "metrics": "Average over four speakers: STMI 0.80 vs 0.52 baseline, PESQ 1.88 vs 1.76 baseline, Corr2D 0.88 vs 0.61 baseline; human word accuracy 55.8% vs 50.9%, correct gender 85.1% vs 43.2%.",
      "evaluation_mode": "Quantitative objective metrics (Corr2D, PESQ, STMI) plus human transcription and quality/naturalness/female/male recognition surveys on Mechanical Turk.",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 1,
          "statement": "Introduces a deep neural network that reconstructs intelligible speech from silent lip movement videos using auditory spectrogram targets and compares it to Vid2Speech.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_lip2audspec-speech-reconstruction-from-silent-lip-movements-video-lip2audspec-speech-reconstruction-from-silent-lip-movements-video.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_lip2audspec-speech-reconstruction-from-silent-lip-movements-video-lip2audspec-speech-reconstruction-from-silent-lip-movements-video.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 1,
          "statement": "The model first compresses auditory spectrograms through an autoencoder and then predicts those bottleneck features from silent lip video using CNN, LSTM, and fully connected layers.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_lip2audspec-speech-reconstruction-from-silent-lip-movements-video-lip2audspec-speech-reconstruction-from-silent-lip-movements-video.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_lip2audspec-speech-reconstruction-from-silent-lip-movements-video-lip2audspec-speech-reconstruction-from-silent-lip-movements-video.txt",
          "section_or_location": "3.2 Network I",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "Average STMI improved from 0.52 to 0.80, PESQ improved from 1.76 to 1.88, Corr2D improved from 0.61 to 0.88 compared to Vid2Speech across four GRID corpus speakers.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_lip2audspec-speech-reconstruction-from-silent-lip-movements-video-lip2audspec-speech-reconstruction-from-silent-lip-movements-video.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_lip2audspec-speech-reconstruction-from-silent-lip-movements-video-lip2audspec-speech-reconstruction-from-silent-lip-movements-video.txt",
          "section_or_location": "4.3.3 Lip",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "Human evaluation showed average word accuracy improved from 50.9% to 55.8%, and correct gender recognition improved from 43.2% to 85.1% over Vid2Speech.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_lip2audspec-speech-reconstruction-from-silent-lip-movements-video-lip2audspec-speech-reconstruction-from-silent-lip-movements-video.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_lip2audspec-speech-reconstruction-from-silent-lip-movements-video-lip2audspec-speech-reconstruction-from-silent-lip-movements-video.txt",
          "section_or_location": "4.3.4 Human evaluations",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 1,
          "statement": "Lip-only video input misses tongue and throat information, making it difficult to reconstruct high frequency details and different vowels accurately; limited to 4 speakers and closed vocabulary in GRID corpus.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_lip2audspec-speech-reconstruction-from-silent-lip-movements-video-lip2audspec-speech-reconstruction-from-silent-lip-movements-video.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_lip2audspec-speech-reconstruction-from-silent-lip-movements-video-lip2audspec-speech-reconstruction-from-silent-lip-movements-video.txt",
          "section_or_location": "1 Introduction",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "deployment_claim",
          "confidence": 1,
          "statement": "The network is trained and tested on a small closed-vocabulary dataset of 4 GRID corpus speakers; lacking real-world generalization and speaker variability for direct practical deployment.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_lip2audspec-speech-reconstruction-from-silent-lip-movements-video-lip2audspec-speech-reconstruction-from-silent-lip-movements-video.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_lip2audspec-speech-reconstruction-from-silent-lip-movements-video-lip2audspec-speech-reconstruction-from-silent-lip-movements-video.txt",
          "section_or_location": "4.1 Dataset",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_updating-the-silent-speech-challenge-benchmark-with-deep-learning",
      "slug": "updating-the-silent-speech-challenge-benchmark-with-deep-learning",
      "title": "Updating the silent speech challenge benchmark with deep learning",
      "year": 2017,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Yan Ji",
        "Licheng Liu",
        "Hongcui Wang",
        "Zhilei Liu",
        "Zhibin Niu",
        "Bruce Denby"
      ],
      "url": "https://nao-ki-mura.com/paper/updating-the-silent-speech-challenge-benchmark-with-deep-learning",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "1709.06818",
      "arxiv_url": "https://arxiv.org/abs/1709.06818",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:lip",
        "body_site:tongue",
        "deployment:speaker-dependent",
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "modality:ultrasound",
        "modality:video",
        "output:text",
        "task:speech-recognition"
      ],
      "expert_take_short": "Benchmark update with a real, reproducible WER gain.",
      "expert_take_long": "The strongest part of the paper is methodological discipline. It does not introduce a new sensing modality; it keeps the Silent Speech Challenge framing intact long enough to ask what deep learning changes. Table 2 is the headline: the original 17.4% WER benchmark remains 17.4% under a Kaldi GMM-HMM reproduction, then drops to 6.45% with a DNN-HMM. Table 3 also shows that language-model choice matters, with DNN WER moving from 11.44% under the WSJ LM to 6.45% under the task-specific CSR LM. That makes this a benchmark-updating paper with genuine value, but still within a single-speaker controlled archive rather than a broadly deployable SSI.",
      "expert_true_value": "This is a benchmark-anchored SSI paper with a real methodological payoff: on the same archive and decoding framing, DNN-HMM recognition almost triples performance over the original benchmark value.",
      "canon_before": "The Silent Speech Challenge was a fixed ultrasound-plus-lip benchmark whose published reference point was 17.4% WER with an HTK GMM-HMM system.",
      "delta_from_canon": "The paper reruns the benchmark with matched decoding in Kaldi, then shows a DNN-HMM can cut WER to 6.45% and that language-model choice matters materially.",
      "position_in_field": "Canonical benchmark update for ultrasound-plus-lip silent speech recognition.",
      "practical_value": "High for benchmarking and for understanding what part of the gain comes from model class versus feature or language-model choice.",
      "axes_moved": "benchmark reproducibility; evaluation; modeling baseline",
      "axes_unresolved": "speaker independence; larger corpora; live deployment",
      "axes_regressed": "",
      "technical_limits": "Everything remains single-speaker, controlled, and benchmark-bounded; there is no evidence of speaker independence or real-world robustness.",
      "evaluation_limits": "The archive is small and single-speaker, so large gains on this benchmark do not automatically transfer to broader SSI deployment.",
      "deployment_limits": "No live deployment, calibration burden analysis, or user-facing interface is reported.",
      "scope_limits": "Speaker-dependent ultrasound-plus-lip benchmark study.",
      "task": "speech recognition",
      "input_modality": "ultrasound tongue imaging and lip video",
      "sensor_hardware": "Ultrasound transducer placed under the chin plus a small lip video camera",
      "body_site": "tongue / lips",
      "output_type": "text",
      "vocabulary_type": "continuous-speech benchmark vocabulary",
      "vocabulary_size": "WSJ0 5000-word lexicon with task-specific and WSJ language-model settings",
      "metrics": "Table 2 reports 17.4% WER for the original HTK benchmark, 17.4% for a Kaldi GMM-HMM reproduction, and 6.45% for the Kaldi DNN-HMM with 30-element DCT features. Table 3 shows DNN WER of 11.44% with the WSJ LM and 6.45% with the task-specific CSR LM.",
      "evaluation_mode": "Direct comparison to the original Silent Speech Challenge benchmark plus language-model and feature sweeps on the same single-speaker archive.",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 1,
          "statement": "The abstract says the 2010 Silent Speech Challenge benchmark is updated with deep-learning results and that a best word error rate of 6.4% is obtained compared with the published 17.4% value.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_updating-the-silent-speech-challenge-benchmark-with-deep-learning-updating-the-silent-speech-challenge-benchmark-with-deep-learning.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_updating-the-silent-speech-challenge-benchmark-with-deep-learning-updating-the-silent-speech-challenge-benchmark-with-deep-learning.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 1,
          "statement": "The Silent Speech Challenge benchmark is a single-speaker archive built from ultrasound and lip video recorded from one native English speaker with an independent test corpus.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_updating-the-silent-speech-challenge-benchmark-with-deep-learning-updating-the-silent-speech-challenge-benchmark-with-deep-learning.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_updating-the-silent-speech-challenge-benchmark-with-deep-learning-updating-the-silent-speech-challenge-benchmark-with-deep-learning.txt",
          "section_or_location": "1.2. The Silent Speech Challenge benchmark",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "Table 2 reports 17.4% WER for the original HTK benchmark, 17.4% for the Kaldi GMM-HMM reproduction, and 6.45% for the Kaldi DNN-HMM using 30-element DCT features.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_updating-the-silent-speech-challenge-benchmark-with-deep-learning-updating-the-silent-speech-challenge-benchmark-with-deep-learning.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_updating-the-silent-speech-challenge-benchmark-with-deep-learning-updating-the-silent-speech-challenge-benchmark-with-deep-learning.txt",
          "section_or_location": "Table 2. Comparison with original HTK result of [52], using 30-element DCT features",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "Table 3 shows that with 30-element DCT features the DNN system gets 11.44% WER under the WSJ language model and 6.45% under the task-specific CSR language model.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_updating-the-silent-speech-challenge-benchmark-with-deep-learning-updating-the-silent-speech-challenge-benchmark-with-deep-learning.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_updating-the-silent-speech-challenge-benchmark-with-deep-learning-updating-the-silent-speech-challenge-benchmark-with-deep-learning.txt",
          "section_or_location": "Table 3. Comparing results for the 2 different LM, for 30-element feature vectors of both types",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_seeing-through-noise-visually-driven-speaker-separation-and-enhancement",
      "slug": "seeing-through-noise-visually-driven-speaker-separation-and-enhancement",
      "title": "Seeing Through Noise: Visually Driven Speaker Separation and Enhancement",
      "year": 2017,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Aviv Gabbay",
        "Ariel Ephrat",
        "Tavi Halperin",
        "Shmuel Peleg"
      ],
      "url": "https://nao-ki-mura.com/paper/seeing-through-noise-visually-driven-speaker-separation-and-enhancement",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "1708.06767",
      "arxiv_url": "https://arxiv.org/abs/1708.06767",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:face",
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "modality:acoustic",
        "modality:video",
        "output:speech-audio",
        "task:speech-enhancement"
      ],
      "expert_take_short": "Strong audiovisual speech separation and enhancement leveraging face video for speaker-dependent masking; not a silent speech interface paper.",
      "expert_take_long": "This paper presents a visually driven speech separation and enhancement system that uses speaker-dependent video-to-speech network predictions as spectrogram priors to build masking filters over mixed audio input. Experiments on synthetic two-speaker mixtures from GRID and TCD-TIMIT show the approach significantly outperforms audio-only baselines and raw video-to-speech outputs, with ratio masking achieving GRID SDR 5.62 and PESQ 2.6 (vs. audio-only 1.74/1.85), and TCD-TIMIT SDR 8.68 and PESQ 2.71 (vs. audio-only 2.91/2.16). Enhancement experiments similarly surpass raw predictions. An unknown-speaker transfer experiment on GRID demonstrates reduced but viable performance after limited fine-tuning (SDR 3.06, PESQ 2.42). The approach leverages visual information to overcome challenges in audio-only same-gender speaker separation but still requires known faces and training per speaker, limiting direct application in silent speech interfaces. Although effective for audiovisual separation and enhancement in controlled datasets, the method has medium-high deployment readiness gaps due to reliance on speaker-dependent models, lack of comprehensive real-world noisy testing, and limited zero-shot generalization.",
      "expert_true_value": "Demonstrates that visual speech predictions can effectively guide mask construction for separating or enhancing a known target speaker from noisy audio mixtures, notably improving on audio-only separation especially for same-gender mixtures.",
      "canon_before": "Audio-only speech separation methods struggled especially on same-gender mixtures, and prior video-to-speech methods generated speech directly without using them as separation priors.",
      "delta_from_canon": "Reframes vid2speech predictions as an intermediate prior for constructing masks for speech separation and enhancement, rather than as final speech output.",
      "position_in_field": "Strong 2017 audiovisual speech separation paper that effectively uses visual priors for masking, adjacent to but distinct from silent speech interface research.",
      "practical_value": "Useful in scenarios requiring isolation of a visible speaker's voice from mixtures or background noise using audiovisual data.",
      "axes_moved": "system_design; evaluation",
      "axes_unresolved": "real-world multi-speaker video; zero-shot unknown speakers; end-to-end training",
      "axes_regressed": "",
      "technical_limits": "Limited to speaker-dependent models; unknown-speaker separation requires fine-tuning with audiovisual samples; evaluated mostly on synthetic mixtures; real-world noisy environment performance not fully benchmarked.",
      "evaluation_limits": "Evaluations use synthetic benchmark mixtures (GRID and TCD-TIMIT) with known speakers; unknown speaker separation evaluated with 5-min fine-tuning; no large-scale real-world noisy environment benchmarks.",
      "deployment_limits": "Requires visible speaker face video and speaker-dependent model training; unknown speaker separation requires fine-tuning with audiovisual data.",
      "scope_limits": "Audiovisual speaker separation and enhancement tested on synthetic GRID and TCD-TIMIT two-speaker mixtures and limited unknown speaker transfer on GRID.",
      "task": "speech-enhancement",
      "input_modality": "audio plus face video",
      "sensor_hardware": "camera + microphone",
      "body_site": "face",
      "output_type": "speech-audio",
      "vocabulary_type": "",
      "vocabulary_size": "",
      "metrics": "Table 1 reports GRID ratio-mask separation SDR 5.62 and PESQ 2.6 vs. audio-only 1.74 and 1.85; TCD-TIMIT ratio-mask SDR 8.68 and PESQ 2.71 vs. audio-only 2.91 and 2.16. Table 3 shows unknown-speaker GRID fine-tuned separation SDR 3.06 and PESQ 2.42.",
      "evaluation_mode": "Speech separation and enhancement evaluated with SDR, SIR, SAR, PESQ metrics, plus unknown-speaker transfer test.",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 1,
          "statement": "Uses face video to separate or enhance a target speaker from noisy mixtures by applying video-to-speech neural network predictions as filters on the mixed audio. Compares results to audio-only baselines and raw vid2speech predictions.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_seeing-through-noise-visually-driven-speaker-separation-and-enhancement-seeing-through-noise-visually-driven-speaker-separation-and-enhancement.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_seeing-through-noise-visually-driven-speaker-separation-and-enhancement-seeing-through-noise-visually-driven-speaker-separation-and-enhancement.txt",
          "section_or_location": "ABSTRACT",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 1,
          "statement": "The system predicts mel-scale speech spectrograms from the speaker's face video using a vid2speech neural network and uses these spectrograms as control signals to construct time-frequency masks (binary or ratio) to separate or enhance the target speaker from the audio mixture.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_seeing-through-noise-visually-driven-speaker-separation-and-enhancement-seeing-through-noise-visually-driven-speaker-separation-and-enhancement.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_seeing-through-noise-visually-driven-speaker-separation-and-enhancement-seeing-through-noise-visually-driven-speaker-separation-and-enhancement.txt",
          "section_or_location": "2. VISUALLY",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "Table 1 reports GRID ratio-mask separation SDR 5.62 and PESQ 2.6 vs. audio-only 1.74 and 1.85; TCD-TIMIT ratio-mask SDR 8.68 and PESQ 2.71 vs. audio-only 2.91 and 2.16. Table 3 shows unknown-speaker GRID fine-tuned separation SDR 3.06 and PESQ 2.42.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_seeing-through-noise-visually-driven-speaker-separation-and-enhancement-seeing-through-noise-visually-driven-speaker-separation-and-enhancement.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_seeing-through-noise-visually-driven-speaker-separation-and-enhancement-seeing-through-noise-visually-driven-speaker-separation-and-enhancement.txt",
          "section_or_location": "4.3. Results",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 1,
          "statement": "Evaluations use synthetic benchmark mixtures (GRID and TCD-TIMIT) with known speakers; unknown speaker separation evaluated with 5-min fine-tuning; no large-scale real-world noisy environment benchmarks.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_seeing-through-noise-visually-driven-speaker-separation-and-enhancement-seeing-through-noise-visually-driven-speaker-separation-and-enhancement.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_seeing-through-noise-visually-driven-speaker-separation-and-enhancement-seeing-through-noise-visually-driven-speaker-separation-and-enhancement.txt",
          "section_or_location": "4. EXPERIMENTS",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 1,
          "statement": "Requires visible speaker face video and speaker-dependent model training; unknown speaker separation requires fine-tuning with audiovisual data.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_seeing-through-noise-visually-driven-speaker-separation-and-enhancement-seeing-through-noise-visually-driven-speaker-separation-and-enhancement.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_seeing-through-noise-visually-driven-speaker-separation-and-enhancement-seeing-through-noise-visually-driven-speaker-separation-and-enhancement.txt",
          "section_or_location": "4.3. Results",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "deployment_claim",
          "confidence": 1,
          "statement": "This is an audiovisual separation system, not a silent speech interface. Unknown speaker separation still requires fine-tuning and visible face video, limiting immediate deployment.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_seeing-through-noise-visually-driven-speaker-separation-and-enhancement-seeing-through-noise-visually-driven-speaker-separation-and-enhancement.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_seeing-through-noise-visually-driven-speaker-separation-and-enhancement-seeing-through-noise-visually-driven-speaker-separation-and-enhancement.txt",
          "section_or_location": "5. CONCLUDING REMARKS",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_improved-speech-reconstruction-from-silent-video",
      "slug": "improved-speech-reconstruction-from-silent-video",
      "title": "Improved Speech Reconstruction from Silent Video",
      "year": 2017,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Ariel Ephrat",
        "Tavi Halperin",
        "Shmuel Peleg"
      ],
      "url": "https://nao-ki-mura.com/paper/improved-speech-reconstruction-from-silent-video",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "1708.01204",
      "arxiv_url": "https://arxiv.org/abs/1708.01204",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:face",
        "deployment:hands-free",
        "deployment:speaker-dependent",
        "evaluation:quantitative",
        "evaluation:structured-benchmark",
        "evaluation:unseen-words",
        "modality:video",
        "output:speech-audio",
        "task:speech-reconstruction"
      ],
      "expert_take_short": "Strong, benchmark-setting speaker-dependent video-to-speech system that advances speech reconstruction from silent face video but remains limited to per-speaker training and constrained conditions.",
      "expert_take_long": "This paper demonstrates a solid early approach to direct speech reconstruction from silent full-face video using a two-stream ResNet encoder (pixels and optical flow) feeding a decoder and a Tacotron-inspired postnet to produce smooth, natural speech spectrogram reconstructions. Tested on GRID and TCD-TIMIT lipspeaker datasets, it achieves significant improvements in objective intelligibility (STOI up to 0.68 on GRID speaker S3) and subjective word accuracy (55.8% vs 50.9% over Vid2Speech). The method shows promising but limited success on the unconstrained vocabulary of TCD-TIMIT, where intelligibility is notably lower. Its main limitation remains explicit speaker-dependence, requiring separate training per speaker and lacking validation on unseen or in-the-wild speakers. Overall, it sets a benchmark for speaker-dependent video-to-speech systems using end-to-end CNN architectures but leaves open questions about generalization and deployment.",
      "expert_true_value": "The paper’s key value lies in successfully demonstrating that a dual-input CNN model combining full-face video pixels and optical flow with a temporal post-processing network can reconstruct intelligible and more natural speech audio from silent video, surpassing prior methods in objective and human evaluation, and extending progress towards unconstrained vocabulary speech reconstruction, though speaker-independence and real-world usage remain unresolved.",
      "canon_before": "Video-to-speech speechreading methods mostly treated as visual-to-text classification with limited vocabularies; voxel-to-audio regression methods existed but yielded robotic or unintelligible speech due to limited network architectures and loss functions.",
      "delta_from_canon": "Introduces a dual-stream ResNet encoder with pixels plus optical flow, a post-processing CBHG network for temporal refinement, and regression to mel and linear spectrogram audio features enabling smoother and more natural speech reconstruction, as well as evaluation beyond constrained vocabularies.",
      "position_in_field": "Early strong baseline for visually-driven speech reconstruction; notable for integrating optical flow and postnet to improve audio naturalness and intelligibility.",
      "practical_value": "Provides a reference architecture and baseline benchmarks for speaker-dependent video-to-speech synthesis and audio reconstruction from silent face video for controlled settings and lab conditions.",
      "axes_moved": "system_design; evaluation",
      "axes_unresolved": "speaker-independent reconstruction; real-time processing; in-the-wild capture; unconstrained vocabulary full intelligibility",
      "axes_regressed": "",
      "technical_limits": "Speaker-dependent modeling requiring per-speaker training; unconstrained vocabularies only partially intelligible; audio quality still limited; no demonstration of real-time or unseen speaker generalization.",
      "evaluation_limits": "Evaluation is offline using GRID and TCD-TIMIT datasets, with no unseen speaker evaluation beyond lipspeaker subsets and no real-time or in-the-wild validation; objective metrics (STOI, ESTOI, PESQ, ViSQOL) and Mechanical Turk intelligibility tests on limited vocabulary datasets.",
      "deployment_limits": "Speaker-dependent model requiring per-speaker training on cropped, registered full-face video; no demonstrated use on unknown speakers or unconstrained real-world conditions; no real-time deployment shown.",
      "scope_limits": "Speaker-dependent speech reconstruction from full-face silent video; two benchmark datasets GRID and TCD-TIMIT Lipspeakers; no unseen speaker or real-world environment tested.",
      "task": "speech-reconstruction",
      "input_modality": "silent full-face video frames plus dense optical flow derived from consecutive frames",
      "sensor_hardware": "camera",
      "body_site": "face",
      "output_type": "speech-audio",
      "vocabulary_type": "mixed benchmark vocabulary",
      "vocabulary_size": "51-word GRID grammar plus many unseen words in TCD-TIMIT testing",
      "metrics": "GRID S3 STOI of 0.68, ESTOI 0.398, PESQ 1.974, ViSQOL 3.349; TCD-TIMIT Lipspeaker 3 STOI 0.63, ESTOI 0.447, PESQ 1.612; Mechanical Turk word accuracy improved from 50.9% to 55.8% over previous Vid2Speech method.",
      "evaluation_mode": "Objective speech quality and intelligibility metrics plus human intelligibility study and ablation experiments.",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 1,
          "statement": "We present an end-to-end model based on a convolutional neural network (CNN) for generating an intelligible and natural-sounding acoustic speech signal from silent video frames of a speaking person, trained and evaluated on GRID and TCD-TIMIT datasets with significant quality improvements over prior methods.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_improved-speech-reconstruction-from-silent-video-improved-speech-reconstruction-from-silent-video.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_improved-speech-reconstruction-from-silent-video-improved-speech-reconstruction-from-silent-video.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 1,
          "statement": "The proposed model includes a dual-tower Residual neural network (ResNet) encoder processing grayscale face images and optical flow separately, whose embeddings are concatenated and fed to a decoder and a post-processing CBHG network inspired by Tacotron to produce mel and linear spectrograms representing speech.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_improved-speech-reconstruction-from-silent-video-improved-speech-reconstruction-from-silent-video.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_improved-speech-reconstruction-from-silent-video-improved-speech-reconstruction-from-silent-video.txt",
          "section_or_location": "4. Model architecture",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 1,
          "statement": "Experiments are performed on GRID dataset speakers S1-S4 using 80/20 train/test splits and on TCD-TIMIT lipspeakers 1-3, showing quantitative improvements on objective intelligibility metrics (STOI, ESTOI) and speech quality metrics (PESQ, ViSQOL).",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_improved-speech-reconstruction-from-silent-video-improved-speech-reconstruction-from-silent-video.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_improved-speech-reconstruction-from-silent-video-improved-speech-reconstruction-from-silent-video.txt",
          "section_or_location": "6. Experiments",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "On GRID test speaker S3 the model achieves STOI 0.68, ESTOI 0.398, PESQ 1.974, ViSQOL 3.349 when reconstructing speech; on TCD-TIMIT lipspeaker 3 it achieves STOI 0.63, ESTOI 0.447, PESQ 1.612, with many test words outside training.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_improved-speech-reconstruction-from-silent-video-improved-speech-reconstruction-from-silent-video.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_improved-speech-reconstruction-from-silent-video-improved-speech-reconstruction-from-silent-video.txt",
          "section_or_location": "6. Experiments",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 1,
          "statement": "The method is explicitly speaker-dependent, requiring per-speaker training; the unconstrained-vocabulary setting of the TCD-TIMIT dataset results in less intelligible reconstructed speech; no unseen-speaker or real-time deployment evaluation was conducted.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_improved-speech-reconstruction-from-silent-video-improved-speech-reconstruction-from-silent-video.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_improved-speech-reconstruction-from-silent-video-improved-speech-reconstruction-from-silent-video.txt",
          "section_or_location": "4. Model architecture",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "deployment_claim",
          "confidence": 0.9,
          "statement": "The model requires individual per-speaker training on registered full-face video frames, limiting real-world deployment; speaker independence and in-the-wild robustness are not addressed.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_improved-speech-reconstruction-from-silent-video-improved-speech-reconstruction-from-silent-video.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_improved-speech-reconstruction-from-silent-video-improved-speech-reconstruction-from-silent-video.txt",
          "section_or_location": "4. Model architecture",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "author_claim",
          "confidence": 1,
          "statement": "The approach models speechreading as a direct regression from silent video frames to speech spectrogram features instead of a classification approach, allowing reconstruction of speech from unconstrained word dictionaries.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_improved-speech-reconstruction-from-silent-video-improved-speech-reconstruction-from-silent-video.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_improved-speech-reconstruction-from-silent-video-improved-speech-reconstruction-from-silent-video.txt",
          "section_or_location": "Introduction",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 1,
          "statement": "The method improves prior video-to-speech work by fusing raw pixels and optical flow features within a dual-stream CNN encoder, replacing VGG with ResNet, and employing a Tacotron-based post-processing network to enhance speech naturalness and smoothness.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_improved-speech-reconstruction-from-silent-video-improved-speech-reconstruction-from-silent-video.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_improved-speech-reconstruction-from-silent-video-improved-speech-reconstruction-from-silent-video.txt",
          "section_or_location": "4. Model architecture",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 1,
          "statement": "Evaluation includes objective intelligibility and speech quality metrics, human intelligibility experiments (Mechanical Turk), and ablations of optical flow and postnet components, demonstrating pixels carry most information but flow and postnet improve results.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_improved-speech-reconstruction-from-silent-video-improved-speech-reconstruction-from-silent-video.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_improved-speech-reconstruction-from-silent-video-improved-speech-reconstruction-from-silent-video.txt",
          "section_or_location": "6. Experiments",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "author_claim",
          "confidence": 1,
          "statement": "The system reconstructs multi-frame speech audio by jointly predicting mel-scale spectrograms over multiple consecutive video frames, improving smoothness over prior single-frame prediction methods.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_improved-speech-reconstruction-from-silent-video-improved-speech-reconstruction-from-silent-video.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_improved-speech-reconstruction-from-silent-video-improved-speech-reconstruction-from-silent-video.txt",
          "section_or_location": "4. Model architecture",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_vid2speech-speech-reconstruction-from-silent-video",
      "slug": "vid2speech-speech-reconstruction-from-silent-video",
      "title": "Vid2speech: Speech Reconstruction from Silent Video",
      "year": 2017,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Ariel Ephrat",
        "Shmuel Peleg"
      ],
      "url": "https://nao-ki-mura.com/paper/vid2speech-speech-reconstruction-from-silent-video",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "1701.00495",
      "arxiv_url": "https://arxiv.org/abs/1701.00495",
      "review_state": "expert_fulltext_draft",
      "review_priority": "high",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:face",
        "body_site:lip",
        "deployment:speaker-dependent",
        "evaluation:quantitative",
        "evaluation:unseen-words",
        "modality:video",
        "output:speech-audio",
        "task:speech-reconstruction"
      ],
      "expert_take_short": "Real lip-to-speech progress, still tightly benchmark-bounded.",
      "expert_take_long": "The full text backs a stronger claim than the abstract-only version. On the constrained GRID setup, reconstructed speech becomes much more intelligible than prior work: Table 2 shows 82.6% audio-only intelligibility on S4 and about 80% audio-visual intelligibility on S4 and S2, versus 40.0% and 51.9% in the cited baseline. The OOV experiment is also real rather than decorative: Table 3 reports 51.6% accuracy when two digits are held out of training, far above the 10% chance rate. The limits are equally clear: speaker dependence, LPC-like synthesis artifacts, and a fixed 51-word grammar keep this from being a practical open-world SSI.",
      "expert_true_value": "The paper is an early but real lip-to-speech milestone: within GRID, silent video can drive intelligible reconstructed speech and even partial OOV recovery.",
      "canon_before": "Most visual silent-speech work focused on recognition rather than direct speech reconstruction, and earlier reconstruction quality was substantially lower.",
      "delta_from_canon": "The paper models automatic speechreading as regression to acoustic features and shows materially improved human intelligibility, including a held-out-digit OOV test.",
      "position_in_field": "Important early visual speech-reconstruction paper in SSI-adjacent lip-to-speech research.",
      "practical_value": "High as a proof of concept for lip-to-speech generation, though still too constrained for open deployment.",
      "axes_moved": "problem framing; evaluation; speech reconstruction",
      "axes_unresolved": "speaker independence; naturalness; unconstrained vocabulary and capture",
      "axes_regressed": "",
      "technical_limits": "The system depends on speaker-specific training, a constrained GRID grammar, and an LPC-style resynthesis path that still sounds unnatural.",
      "evaluation_limits": "Evidence is limited to GRID and a relatively small listening study rather than an open-vocabulary or in-the-wild evaluation.",
      "deployment_limits": "No live camera-to-audio system or user-facing deployment is shown.",
      "scope_limits": "Constrained lip-to-speech reconstruction study.",
      "task": "speech-reconstruction",
      "input_modality": "silent face video",
      "sensor_hardware": "25 FPS, 720x576 video from the GRID audiovisual corpus",
      "body_site": "face / lips",
      "output_type": "speech audio",
      "vocabulary_type": "GRID sentence grammar",
      "vocabulary_size": "51 words",
      "metrics": "Table 2 reports 82.6% audio-only intelligibility for S4 and 79.9% / 79.0% audio-visual intelligibility for S4 / S2, compared with 40.0% audio-only and 51.9% audio-visual in prior work [10]. Table 3 reports 51.6% OOV audio-visual intelligibility versus 10.0% chance and 93.4% when no digits are held out.",
      "evaluation_mode": "Human intelligibility studies on reconstructed audio-only, audio-visual, and out-of-vocabulary settings using MTurk listeners.",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 1,
          "statement": "The paper presents an end-to-end CNN-based model that generates an intelligible audio signal directly from silent video of a speaking person and shows promising results for out-of-vocabulary words.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_vid2speech-speech-reconstruction-from-silent-video-vid2speech-speech-reconstruction-from-silent-video.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_vid2speech-speech-reconstruction-from-silent-video-vid2speech-speech-reconstruction-from-silent-video.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "validation_scope",
          "confidence": 1,
          "statement": "The main reconstruction experiment uses the 1000 videos of speaker S4 from GRID, while the corpus as described here contains 51 words and fixed 3-second clips.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_vid2speech-speech-reconstruction-from-silent-video-vid2speech-speech-reconstruction-from-silent-video.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_vid2speech-speech-reconstruction-from-silent-video-vid2speech-speech-reconstruction-from-silent-video.txt",
          "section_or_location": "4.2. Sound prediction tasks",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "Table 2 reports 82.6% audio-only intelligibility for speaker S4 and 79.9% / 79.0% audio-visual intelligibility for speakers S4 / S2, versus 40.0% audio-only and 51.9% audio-visual in prior work [10].",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_vid2speech-speech-reconstruction-from-silent-video-vid2speech-speech-reconstruction-from-silent-video.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_vid2speech-speech-reconstruction-from-silent-video-vid2speech-speech-reconstruction-from-silent-video.txt",
          "section_or_location": "Table 2. Our reconstructed speech is significantly more intelligible than the results of [10].",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "Table 3 reports 51.6% OOV audio-visual intelligibility, compared with 93.4% when no digits are held out and 10.0% chance.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_vid2speech-speech-reconstruction-from-silent-video-vid2speech-speech-reconstruction-from-silent-video.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_vid2speech-speech-reconstruction-from-silent-video-vid2speech-speech-reconstruction-from-silent-video.txt",
          "section_or_location": "Table 3. Out-of-vocabulary (OOV) intelligibility results.",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_contour-based-3d-tongue-motion-visualization-using-ultrasound-image-sequences",
      "slug": "contour-based-3d-tongue-motion-visualization-using-ultrasound-image-sequences",
      "title": "Contour-based 3d tongue motion visualization using ultrasound image sequences",
      "year": 2016,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Kele Xu",
        "Yin Yang",
        "Clemence Leboullenger",
        "Pierre Roussel",
        "Bruce Denby"
      ],
      "url": "https://nao-ki-mura.com/paper/contour-based-3d-tongue-motion-visualization-using-ultrasound-image-sequences",
      "doi": "",
      "doi_url": "",
      "arxiv_id": "1605.05967",
      "arxiv_url": "https://arxiv.org/abs/1605.05967",
      "review_state": "expert_fulltext_draft",
      "review_priority": "medium",
      "review_confidence": "high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [
        "body_site:tongue",
        "evaluation:quantitative",
        "modality:ultrasound"
      ],
      "expert_take_short": "Useful tongue-modeling tool, not a recognizer.",
      "expert_take_long": "The full text supports a more careful reading than the seed version. The method is technically specific: contour extraction from ultrasound drives a finite-element tongue model through modal reduction and modal warping, and the runtime system searches a 1000-shape database using contour similarity. Section 5 reports about 1.2 seconds per frame association on desktop hardware, which is better than a purely offline toy demo but still not a proven real-time SSI. The authors are also explicit that quantitative evaluation is missing and that midsagittal-only information cannot capture out-of-plane tongue motion. So this is valuable adjacent modeling work, but not a speech-decoding paper.",
      "expert_true_value": "This is an SSI-adjacent articulatory visualization paper, not a speech recognizer, but it offers a concrete modeling pipeline for turning ultrasound contours into 3D tongue motion.",
      "canon_before": "Ultrasound tongue work often focused on contour tracking or 2D visualization rather than a contour-driven 3D deformation model.",
      "delta_from_canon": "The paper links ultrasound contour extraction to a 3D finite-element tongue model via modal reduction, modal warping, and contour-matching over a precomputed shape database.",
      "position_in_field": "Adjacent tongue-modeling and visualization work that can support SSI interpretation and articulation analysis.",
      "practical_value": "Moderate for articulatory analysis and future SSI tooling, limited as a direct communication interface.",
      "axes_moved": "articulatory modeling; visualization; problem framing",
      "axes_unresolved": "quantitative validation; out-of-plane motion; end-to-end interactive performance",
      "axes_regressed": "",
      "technical_limits": "The model uses only midsagittal information, relies on a simplified four-node driving scheme, and lacks a quantitative accuracy benchmark.",
      "evaluation_limits": "Validation is qualitative and runtime-oriented; no MRI- or EMA-backed quantitative benchmark is reported.",
      "deployment_limits": "This is not a communication interface and does not show user-facing SSI deployment.",
      "scope_limits": "SSI-adjacent visualization and modeling paper, not speech recognition.",
      "task": "tongue motion visualization",
      "input_modality": "B-mode ultrasound tongue images",
      "sensor_hardware": "B-mode ultrasound imaging with contour extraction from the midsagittal tongue surface",
      "body_site": "tongue",
      "output_type": "3D tongue visualization",
      "vocabulary_type": "",
      "vocabulary_size": "",
      "metrics": "The database contains 1000 sample 3D tongue shapes, and Section 5 reports about 1.2 seconds on average to build the association between the current ultrasound frame and the 3D tongue model on the reported desktop hardware. The paper explicitly says no effective quantitative evaluation method is yet available.",
      "evaluation_mode": "Qualitative visualization with runtime reporting and midsagittal overlay validation against ultrasound contours.",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 1,
          "statement": "The abstract describes a contour-based 3D tongue deformation visualization framework that uses B-mode ultrasound sequences, a generic finite-element tongue model, modal reduction, and modal warping.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_contour-based-3d-tongue-motion-visualization-using-ultrasound-image-sequences-contour-based-3d-tongue-motion-visualization-using-ultrasound-image-sequences.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_contour-based-3d-tongue-motion-visualization-using-ultrasound-image-sequences-contour-based-3d-tongue-motion-visualization-using-ultrasound-image-sequences.txt",
          "section_or_location": "ABSTRACT",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "fact",
          "confidence": 1,
          "statement": "The method builds a 1000-shape 3D tongue database and selects the tongue shape whose projected contour best matches the contour extracted from the current ultrasound frame.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_contour-based-3d-tongue-motion-visualization-using-ultrasound-image-sequences-contour-based-3d-tongue-motion-visualization-using-ultrasound-image-sequences.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_contour-based-3d-tongue-motion-visualization-using-ultrasound-image-sequences-contour-based-3d-tongue-motion-visualization-using-ultrasound-image-sequences.txt",
          "section_or_location": "4. CONTOUR-BASED 3D TONGUE MOTION VISUALIZATION",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 1,
          "statement": "Section 5 reports an average processing time of about 1.2 seconds to build the association between the current ultrasound frame and the 3D tongue model on the reported desktop platform.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_contour-based-3d-tongue-motion-visualization-using-ultrasound-image-sequences-contour-based-3d-tongue-motion-visualization-using-ultrasound-image-sequences.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_contour-based-3d-tongue-motion-visualization-using-ultrasound-image-sequences-contour-based-3d-tongue-motion-visualization-using-ultrasound-image-sequences.txt",
          "section_or_location": "5. RESULTS",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 1,
          "statement": "The discussion says there is still no effective quantitative evaluation method, midsagittal-only motion is insufficient for full tongue accuracy, and future validation should use modalities such as MRI and EMA.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_contour-based-3d-tongue-motion-visualization-using-ultrasound-image-sequences-contour-based-3d-tongue-motion-visualization-using-ultrasound-image-sequences.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_contour-based-3d-tongue-motion-visualization-using-ultrasound-image-sequences-contour-based-3d-tongue-motion-visualization-using-ultrasound-image-sequences.txt",
          "section_or_location": "6. DISCUSSION AND FUTURE WORK",
          "evidence_kind": "full_text"
        }
      ]
    },
    {
      "paper_id": "ssi_optimal-power-control-for-analog-bidirectional-relaying-with-long-term-relay-power-constraint",
      "slug": "optimal-power-control-for-analog-bidirectional-relaying-with-long-term-relay-power-constraint",
      "title": "Optimal Power Control for Analog Bidirectional Relaying with Long-Term Relay Power Constraint",
      "year": 2014,
      "venue": "arXiv / imported corpus page",
      "authors": [
        "Zoran Hadzi-Velkov",
        "Nikola Zlatanov",
        "Robert Schober"
      ],
      "url": "https://nao-ki-mura.com/paper/optimal-power-control-for-analog-bidirectional-relaying-with-long-term-relay-power-constraint",
      "doi": "10.1109/GLOCOM.2013.6831710",
      "doi_url": "https://doi.org/10.1109/GLOCOM.2013.6831710",
      "arxiv_id": "1404.0906",
      "arxiv_url": "https://arxiv.org/abs/1404.0906",
      "review_state": "expert_fulltext_draft",
      "review_priority": "low",
      "review_confidence": "medium-high",
      "review_basis": "full_text",
      "source_coverage": "high",
      "tags": [],
      "expert_take_short": "A rigorous relay power control theory paper optimizing outage under long-term average power constraints for bidirectional AF relaying; solid mathematical contribution but outside SSI relevance.",
      "expert_take_long": "This paper presents a solid communications-theoretic contribution by deriving the optimal power control at the relay for dual-hop bidirectional amplify-and-forward relaying under a long-term average relay power constraint. The solution, formalized in Theorem 1 and its proof, shows that the relay should transmit at the minimum short-term power that avoids outage when it does not exceed a cutoff threshold ρ, and remain silent otherwise. This approach elegantly extends concepts of truncated channel inversion from single links to bidirectional relays. Numerical examples demonstrate significant outage performance improvements and relay power savings compared to fixed power and short-term constrained power allocations. However, the technical scope remains strictly within wireless relay communications theory with idealized fading channel models and does not address silent speech interfaces or SSI systems. The brief mention of speech/video traffic in the motivation does not alter this fundamental scope limitation; thus, the paper belongs outside the SSI domain but remains a coherent, valuable theoretical reference for relay power control design.",
      "expert_true_value": "The main contribution is a closed-form, theorem-backed optimal relay power control policy minimizing outage probability under average relay power constraints in bidirectional AF relaying—a novel extension of truncated channel inversion principles to this setting.",
      "canon_before": "Bidirectional AF relaying had power-allocation work but optimal outage-minimizing relay power control under a long-term average power constraint was open.",
      "delta_from_canon": "Introduces a relay power allocation policy that transmits at minimum outage-free power below a cutoff and stays silent otherwise, under a long-term average power budget.",
      "position_in_field": "Communication theory paper on relay power control for bidirectional AF relay networks, not an SSI contribution.",
      "practical_value": "Useful theoretical results for relay power allocation design in wireless communications; no direct practical value for SSI or speech-related systems.",
      "axes_moved": "theory; optimization",
      "axes_unresolved": "speech-interface relevance; practical wireless-system assumptions beyond the analyzed model",
      "axes_regressed": "",
      "technical_limits": "Idealized channel model that assumes perfect CSI at the relay and fixed end-node powers; no implementation details or SSI task relevance.",
      "evaluation_limits": "Proofs and numerical simulations under idealized Rayleigh fading assumptions; no real-world or SSI task evaluations.",
      "deployment_limits": "Pure communications theory result; no implementation or speech, video, or SSI interface system realized.",
      "scope_limits": "Bidirectional amplify-and-forward relay channel with fixed end-node powers and relay long-term average power constraints under Rayleigh fading.",
      "task": "",
      "input_modality": "Channel State Information (CSI) of relay links.",
      "sensor_hardware": "",
      "body_site": "",
      "output_type": "",
      "vocabulary_type": "",
      "vocabulary_size": "",
      "metrics": "Outage probability minimization under average power constraint, supported by Theorem 1 and numerical outage curves and power saving graphs (Figs. 2 and 3).",
      "evaluation_mode": "Closed-form outage probability derivation and numerical performance evaluations comparing fixed, short-term, and proposed power controls.",
      "evidence": [
        {
          "claim_type": "author_claim",
          "confidence": 0.95,
          "statement": "Derives the outage-minimizing relay power allocation for bidirectional amplify-and-forward relaying with fixed source powers and a long-term relay power constraint.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_optimal-power-control-for-analog-bidirectional-relaying-with-long-term-relay-power-constraint-optimal-power-control-for-analog-bidirectional-relaying-with-long-term-relay-pow.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_optimal-power-control-for-analog-bidirectional-relaying-with-long-term-relay-power-constraint-optimal-power-control-for-analog-bidirectional-relaying-with-long-term-relay-pow.txt",
          "section_or_location": "Abstract",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "actual_novelty",
          "confidence": 0.95,
          "statement": "The solution in Theorem 1 is a truncated channel inversion type cutoff policy that transmits at the minimum short-term relay power required to avoid outage when below a cutoff threshold, and remains silent otherwise, uniquely addressing long-term relay power constraints in bidirectional AF relaying.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_optimal-power-control-for-analog-bidirectional-relaying-with-long-term-relay-power-constraint-optimal-power-control-for-analog-bidirectional-relaying-with-long-term-relay-pow.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_optimal-power-control-for-analog-bidirectional-relaying-with-long-term-relay-power-constraint-optimal-power-control-for-analog-bidirectional-relaying-with-long-term-relay-pow.txt",
          "section_or_location": "III. OUTAGE MINIMIZATION",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "metric",
          "confidence": 0.9,
          "statement": "Minimizes the system outage probability under an average relay power constraint, with numerical examples comparing outage improvement over fixed and short-term power allocation baselines shown in Figs. 2 and 3.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_optimal-power-control-for-analog-bidirectional-relaying-with-long-term-relay-power-constraint-optimal-power-control-for-analog-bidirectional-relaying-with-long-term-relay-pow.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_optimal-power-control-for-analog-bidirectional-relaying-with-long-term-relay-power-constraint-optimal-power-control-for-analog-bidirectional-relaying-with-long-term-relay-pow.txt",
          "section_or_location": "IV. NUMERICAL EXAMPLES",
          "evidence_kind": "full_text"
        },
        {
          "claim_type": "limitation",
          "confidence": 0.9,
          "statement": "Only simulated and derived under the analytical Rayleigh block fading relay model with fixed source powers and perfect CSI at relay; no practical implementation or SSI interface system presented.",
          "evidence_source": "inputs/ssi_fulltext/text/ssi_optimal-power-control-for-analog-bidirectional-relaying-with-long-term-relay-power-constraint-optimal-power-control-for-analog-bidirectional-relaying-with-long-term-relay-pow.txt",
          "source_ref": "inputs/ssi_fulltext/text/ssi_optimal-power-control-for-analog-bidirectional-relaying-with-long-term-relay-power-constraint-optimal-power-control-for-analog-bidirectional-relaying-with-long-term-relay-pow.txt",
          "section_or_location": "V. CONCLUSION",
          "evidence_kind": "full_text"
        }
      ]
    }
  ]
}