Files changed (1) hide show
  1. app.py +81 -138
app.py CHANGED
@@ -1,6 +1,6 @@
1
  """
2
- Hebrew Unified NLP - Async Parallel Pipeline 2025.12.5
3
- ======================================================
4
 
5
  Architecture:
6
  Text → pysbd Sentence Breaker → asyncio.gather([process(s) for s in sentences]) → JSON Array
@@ -287,6 +287,82 @@ LABEL_TRANSLATIONS = {
287
  # Construct state
288
  "Construct": {"en": "Construct", "he": "סמיכות"},
289
  "Free": {"en": "Free", "he": "נפרד"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
290
  }
291
  }
292
 
@@ -982,143 +1058,10 @@ def display_ner_entities(entities: list):
982
  # Internationalization (i18n)
983
  # ============================================================================
984
 
985
- TRANSLATIONS = {
986
- "he": {
987
- "title": "🇮🇱 עיבוד שפה עברית מאוחד",
988
- "subtitle": "צינור מקבילי אסינכרוני | שבירת משפטים חוצה-פלטפורמה | ארכיטקטורת Map-Reduce",
989
- "settings": "⚙️ הגדרות",
990
- "language": "🌐 שפה",
991
- "compute_mst": "צור עץ תחבירי",
992
- "generate_speech": "צור דיבור",
993
- "tts_params": "הגדרות דיבור",
994
- "en_voice": "🇬🇧 קול אנגלית",
995
- "speed": "מהירות",
996
- "noise": "רעש",
997
- "noise_w": "רעש W",
998
- "sentence_pause": "השהיה בין משפטים (שניות)",
999
- "sentence_breaker": "מפצל משפטים",
1000
- "available_backends": "מנועים זמינים",
1001
- "load_example": "📚 טען דוגמה:",
1002
- "enter_text": "הזן טקסט (תמיכה במספר משפטים):",
1003
- "placeholder": "הקלד טקסט...",
1004
- "analyze": "🔍 נתח",
1005
- "processing": "מעבד משפטים במקביל...",
1006
- "enter_text_warning": "אנא הזן טקסט לניתוח.",
1007
- "sentences": "משפטים",
1008
- "processing_time": "זמן עיבוד",
1009
- "workers": "עובדים",
1010
- "breaker": "מפצל",
1011
- "sentence": "משפט",
1012
- "original": "מקור:",
1013
- "nikud": "ניקוד:",
1014
- "phonemes": "פונמות:",
1015
- "dep_tree": "🌳 עץ תלויות",
1016
- "all_trees": "🌳 כל העצים",
1017
- "sentence_json": "📄 JSON משפט",
1018
- "morphology": "🔠 מורפולוגיה",
1019
- "ner": "🏷️ ישויות",
1020
- "speech": "🔊 דיבור",
1021
- "no_speech": "אין דיבור - הפעל 'צור דיבור' בהגדרות",
1022
- "duration": "משך",
1023
- "json_output": "📄 JSON",
1024
- "full_json": "פלט JSON מלא",
1025
- "download_json": "⬇️ הורד JSON",
1026
- "no_entities": "אין ישויות",
1027
- "models_not_loaded": "המודלים לא נטענו.",
1028
- "token": "טוקן",
1029
- "lemma": "למה",
1030
- "pos": "חלק דיבר",
1031
- "prefixes": "תחיליות",
1032
- "head": "ראש",
1033
- "rel": "יחס",
1034
- "play_all": "נגן הכל",
1035
- "stop": "עצור",
1036
- "playing": "מנגן משפט",
1037
- "finished": "✓ סיום",
1038
- "stopped": "נעצר",
1039
- "download_merged_speech": "הורד קובץ דיבור ממוזג (WAV)",
1040
- "download_speech_files": "הורד קבצי דיבור (WAV)",
1041
- "breaking_sentences": "מפצל משפטים...",
1042
- "processing_sentence": "מעבד משפט",
1043
- "finalizing": "מסיים...",
1044
- "done": "✓ הסתיים",
1045
- "no_sentences": "לא נמצאו משפטים",
1046
- "loading_models": "טוען מודלים...",
1047
- "loading_dictabert": "טוען DictaBERT...",
1048
- "loading_phonikud": "טוען Phonikud...",
1049
- "loading_piper": "טוען Piper TTS...",
1050
- },
1051
- "en": {
1052
- "title": "🇮🇱 Hebrew Unified NLP",
1053
- "subtitle": "Async Parallel Pipeline | Cross-Platform Sentence Breaking | Map-Reduce Architecture",
1054
- "settings": "⚙️ Settings",
1055
- "language": "🌐 Language",
1056
- "compute_mst": "Create Syntax Tree",
1057
- "generate_speech": "Generate speech",
1058
- "tts_params": "TTS Parameters",
1059
- "en_voice": "🇬🇧 English Voice",
1060
- "speed": "Speed",
1061
- "noise": "Noise",
1062
- "noise_w": "Noise W",
1063
- "sentence_pause": "Pause between sentences (sec)",
1064
- "sentence_breaker": "Sentence Breaker",
1065
- "available_backends": "Available backends",
1066
- "load_example": "📚 Load Example:",
1067
- "enter_text": "Enter text (multiple sentences supported):",
1068
- "placeholder": "Enter text...",
1069
- "analyze": "🔍 Analyze",
1070
- "processing": "Processing sentences in parallel...",
1071
- "enter_text_warning": "Please enter some text to analyze.",
1072
- "sentences": "Sentences",
1073
- "processing_time": "Processing",
1074
- "workers": "Workers",
1075
- "breaker": "Breaker",
1076
- "sentence": "Sentence",
1077
- "original": "Original:",
1078
- "nikud": "Nikud:",
1079
- "phonemes": "Phonemes:",
1080
- "dep_tree": "🌳 Dependency Tree",
1081
- "all_trees": "🌳 All Trees",
1082
- "sentence_json": "📄 Sentence JSON",
1083
- "morphology": "🔠 Morphology",
1084
- "ner": "🏷️ Named Entities",
1085
- "speech": "🔊 Speech",
1086
- "no_speech": "No speech - enable 'Generate speech' in settings",
1087
- "duration": "Duration",
1088
- "json_output": "📄 JSON",
1089
- "full_json": "Full JSON Output",
1090
- "download_json": "⬇️ Download JSON",
1091
- "no_entities": "No named entities",
1092
- "models_not_loaded": "Models not loaded.",
1093
- "token": "Token",
1094
- "lemma": "Lemma",
1095
- "pos": "POS",
1096
- "prefixes": "Prefixes",
1097
- "head": "Head",
1098
- "rel": "Rel",
1099
- "play_all": "Play All",
1100
- "stop": "Stop",
1101
- "playing": "Playing sentence",
1102
- "finished": "✓ Finished",
1103
- "stopped": "Stopped",
1104
- "download_merged_speech": "Download merged speech file (WAV)",
1105
- "download_speech_files": "Download speech Files (WAV)",
1106
- "breaking_sentences": "Breaking sentences...",
1107
- "processing_sentence": "Processing sentence",
1108
- "finalizing": "Finalizing...",
1109
- "done": "✓ Done",
1110
- "no_sentences": "No sentences found",
1111
- "loading_models": "Loading models...",
1112
- "loading_dictabert": "Loading DictaBERT...",
1113
- "loading_phonikud": "Loading Phonikud...",
1114
- "loading_piper": "Loading Piper TTS...",
1115
- }
1116
- }
1117
-
1118
  def t(key: str) -> str:
1119
- """Get translated string for current language"""
1120
  lang = st.session_state.get('ui_lang', 'he')
1121
- return TRANSLATIONS.get(lang, TRANSLATIONS['en']).get(key, key)
1122
 
1123
  def is_rtl() -> bool:
1124
  """Check if current language is RTL"""
@@ -1130,7 +1073,7 @@ def is_rtl() -> bool:
1130
  # ============================================================================
1131
 
1132
  st.set_page_config(
1133
- page_title="Hebrew Unified NLP",
1134
  page_icon="🇮🇱",
1135
  layout="wide"
1136
  )
 
1
  """
2
+ Bilingual NLP + TTS - Nikud · Syntax · NER · Speech
3
+ ====================================================
4
 
5
  Architecture:
6
  Text → pysbd Sentence Breaker → asyncio.gather([process(s) for s in sentences]) → JSON Array
 
287
  # Construct state
288
  "Construct": {"en": "Construct", "he": "סמיכות"},
289
  "Free": {"en": "Free", "he": "נפרד"}
290
+ },
291
+ "ui": {
292
+ # App header
293
+ "title": {"en": "🇮🇱🇬🇧 Bilingual NLP + TTS", "he": "🇮🇱🇬🇧 ניתוח שפה דו-לשוני + דיבור"},
294
+ "page_title": {"en": "🇮🇱🇬🇧 Bilingual NLP + TTS", "he": "🇮🇱🇬🇧 ניתוח שפה דו-לשוני + דיבור"},
295
+ "subtitle": {"en": "Nikud · Syntax · NER · Speech", "he": "ניקוד · תחביר · ישויות · דיבור"},
296
+ # Settings
297
+ "settings": {"en": "⚙️ Settings", "he": "⚙️ הגדרות"},
298
+ "language": {"en": "🌐 Language", "he": "🌐 שפה"},
299
+ "compute_mst": {"en": "Create Syntax Tree", "he": "צור עץ תחבירי"},
300
+ "generate_speech": {"en": "Generate speech", "he": "צור דיבור"},
301
+ "tts_params": {"en": "TTS Parameters", "he": "הגדרות דיבור"},
302
+ "en_voice": {"en": "🇬🇧 English Voice", "he": "🇬🇧 קול אנגלית"},
303
+ "speed": {"en": "Speed", "he": "מהירות"},
304
+ "noise": {"en": "Noise", "he": "רעש"},
305
+ "noise_w": {"en": "Noise W", "he": "רעש W"},
306
+ "sentence_pause": {"en": "Pause between sentences (sec)", "he": "השהיה בין משפטים (שניות)"},
307
+ "sentence_breaker": {"en": "Sentence Breaker", "he": "מפצל משפטים"},
308
+ "available_backends": {"en": "Available backends", "he": "מנועים זמינים"},
309
+ # Input
310
+ "load_example": {"en": "📚 Load Example:", "he": "📚 טען דוגמה:"},
311
+ "enter_text": {"en": "Enter text (multiple sentences supported):", "he": "הזן טקסט (תמיכה במספר משפטים):"},
312
+ "placeholder": {"en": "Enter text...", "he": "הקלד טקסט..."},
313
+ "analyze": {"en": "🔍 Analyze", "he": "🔍 נתח"},
314
+ "enter_text_warning": {"en": "Please enter some text to analyze.", "he": "אנא הזן טקסט לניתוח."},
315
+ # Processing
316
+ "processing": {"en": "Processing sentences in parallel...", "he": "מעבד משפטים במקביל..."},
317
+ "breaking_sentences": {"en": "Breaking sentences...", "he": "מפצל משפטים..."},
318
+ "processing_sentence": {"en": "Processing sentence", "he": "מעבד משפט"},
319
+ "finalizing": {"en": "Finalizing...", "he": "מסיים..."},
320
+ "done": {"en": "✓ Done", "he": "✓ הסתיים"},
321
+ "no_sentences": {"en": "No sentences found", "he": "לא נמצאו משפטים"},
322
+ # Metrics
323
+ "sentences": {"en": "Sentences", "he": "משפטים"},
324
+ "processing_time": {"en": "Processing", "he": "זמן עיבוד"},
325
+ "workers": {"en": "Workers", "he": "עובדים"},
326
+ "breaker": {"en": "Breaker", "he": "מפצל"},
327
+ # Results
328
+ "sentence": {"en": "Sentence", "he": "משפט"},
329
+ "original": {"en": "Original:", "he": "מקור:"},
330
+ "nikud": {"en": "Nikud:", "he": "ניקוד:"},
331
+ "phonemes": {"en": "Phonemes:", "he": "פונמות:"},
332
+ "dep_tree": {"en": "🌳 Dependency Tree", "he": "🌳 עץ תלויות"},
333
+ "all_trees": {"en": "🌳 All Trees", "he": "🌳 כל העצים"},
334
+ "sentence_json": {"en": "📄 Sentence JSON", "he": "📄 JSON משפט"},
335
+ "morphology": {"en": "🔠 Morphology", "he": "🔠 מורפולוגיה"},
336
+ "ner": {"en": "🏷️ Named Entities", "he": "🏷️ ישויות"},
337
+ "no_entities": {"en": "No named entities", "he": "אין ישויות"},
338
+ # Speech
339
+ "speech": {"en": "🔊 Speech", "he": "🔊 דיבור"},
340
+ "no_speech": {"en": "No speech - enable 'Generate speech' in settings", "he": "אין דיבור - הפעל 'צור דיבור' בהגדרות"},
341
+ "duration": {"en": "Duration", "he": "משך"},
342
+ "play_all": {"en": "Play All", "he": "נגן הכל"},
343
+ "stop": {"en": "Stop", "he": "עצור"},
344
+ "playing": {"en": "Playing sentence", "he": "מנגן משפט"},
345
+ "finished": {"en": "✓ Finished", "he": "✓ סיום"},
346
+ "stopped": {"en": "Stopped", "he": "נעצר"},
347
+ "download_merged_speech": {"en": "Download merged speech file (WAV)", "he": "הורד קובץ דיבור ממוזג (WAV)"},
348
+ "download_speech_files": {"en": "Download speech Files (WAV)", "he": "הורד קבצי דיבור (WAV)"},
349
+ # JSON
350
+ "json_output": {"en": "📄 JSON", "he": "📄 JSON"},
351
+ "full_json": {"en": "Full JSON Output", "he": "פלט JSON מלא"},
352
+ "download_json": {"en": "⬇️ Download JSON", "he": "⬇️ הורד JSON"},
353
+ # Table columns
354
+ "token": {"en": "Token", "he": "טוקן"},
355
+ "lemma": {"en": "Lemma", "he": "למה"},
356
+ "pos": {"en": "POS", "he": "חלק דיבר"},
357
+ "prefixes": {"en": "Prefixes", "he": "תחיליות"},
358
+ "head": {"en": "Head", "he": "ראש"},
359
+ "rel": {"en": "Rel", "he": "יחס"},
360
+ # Loading
361
+ "models_not_loaded": {"en": "Models not loaded.", "he": "המודלים לא נטענו."},
362
+ "loading_models": {"en": "Loading models...", "he": "טוען מודלים..."},
363
+ "loading_dictabert": {"en": "Loading DictaBERT...", "he": "טוען DictaBERT..."},
364
+ "loading_phonikud": {"en": "Loading Phonikud...", "he": "טוען Phonikud..."},
365
+ "loading_piper": {"en": "Loading Piper TTS...", "he": "טוען Piper TTS..."},
366
  }
367
  }
368
 
 
1058
  # Internationalization (i18n)
1059
  # ============================================================================
1060
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1061
  def t(key: str) -> str:
1062
+ """Get translated UI string for current language."""
1063
  lang = st.session_state.get('ui_lang', 'he')
1064
+ return get_label("ui", key, lang)
1065
 
1066
  def is_rtl() -> bool:
1067
  """Check if current language is RTL"""
 
1073
  # ============================================================================
1074
 
1075
  st.set_page_config(
1076
+ page_title="Bilingual NLP + TTS",
1077
  page_icon="🇮🇱",
1078
  layout="wide"
1079
  )