Update app.py
#2
by VRDate - opened
app.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
"""
|
| 2 |
-
|
| 3 |
-
====================================================
|
| 4 |
|
| 5 |
Architecture:
|
| 6 |
Text → pysbd Sentence Breaker → async.gather([process(s) for s in sentences]) → JSON Array
|
|
@@ -287,6 +287,82 @@ LABEL_TRANSLATIONS = {
|
|
| 287 |
# Construct state
|
| 288 |
"Construct": {"en": "Construct", "he": "סמיכות"},
|
| 289 |
"Free": {"en": "Free", "he": "נפרד"}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 290 |
}
|
| 291 |
}
|
| 292 |
|
|
@@ -982,143 +1058,10 @@ def display_ner_entities(entities: list):
|
|
| 982 |
# Internationalization (i18n)
|
| 983 |
# ============================================================================
|
| 984 |
|
| 985 |
-
TRANSLATIONS = {
|
| 986 |
-
"he": {
|
| 987 |
-
"title": "🇮🇱 עיבוד שפה עברית מאוחד",
|
| 988 |
-
"subtitle": "צינור מקבילי אסינכרוני | שבירת משפטים חוצה-פלטפורמה | ארכיטקטורת Map-Reduce",
|
| 989 |
-
"settings": "⚙️ הגדרות",
|
| 990 |
-
"language": "🌐 שפה",
|
| 991 |
-
"compute_mst": "צור עץ תחבירי",
|
| 992 |
-
"generate_speech": "צור דיבור",
|
| 993 |
-
"tts_params": "הגדרות דיבור",
|
| 994 |
-
"en_voice": "🇬🇧 קול אנגלית",
|
| 995 |
-
"speed": "מהירות",
|
| 996 |
-
"noise": "רעש",
|
| 997 |
-
"noise_w": "רעש W",
|
| 998 |
-
"sentence_pause": "השהיה בין משפטים (שניות)",
|
| 999 |
-
"sentence_breaker": "מפצל משפטים",
|
| 1000 |
-
"available_backends": "מנועים זמינים",
|
| 1001 |
-
"load_example": "📚 טען דוגמה:",
|
| 1002 |
-
"enter_text": "הזן טקסט (תמיכה במספר משפטים):",
|
| 1003 |
-
"placeholder": "הקלד טקסט...",
|
| 1004 |
-
"analyze": "🔍 נתח",
|
| 1005 |
-
"processing": "מעבד משפטים במקביל...",
|
| 1006 |
-
"enter_text_warning": "אנא הזן טקסט לניתוח.",
|
| 1007 |
-
"sentences": "משפטים",
|
| 1008 |
-
"processing_time": "זמן עיבוד",
|
| 1009 |
-
"workers": "עובדים",
|
| 1010 |
-
"breaker": "מפצל",
|
| 1011 |
-
"sentence": "משפט",
|
| 1012 |
-
"original": "מקור:",
|
| 1013 |
-
"nikud": "ניקוד:",
|
| 1014 |
-
"phonemes": "פונמות:",
|
| 1015 |
-
"dep_tree": "🌳 עץ תלויות",
|
| 1016 |
-
"all_trees": "🌳 כל העצים",
|
| 1017 |
-
"sentence_json": "📄 JSON משפט",
|
| 1018 |
-
"morphology": "🔠 מורפולוגיה",
|
| 1019 |
-
"ner": "🏷️ ישויות",
|
| 1020 |
-
"speech": "🔊 דיבור",
|
| 1021 |
-
"no_speech": "אין דיבור - הפעל 'צור דיבור' בהגדרות",
|
| 1022 |
-
"duration": "משך",
|
| 1023 |
-
"json_output": "📄 JSON",
|
| 1024 |
-
"full_json": "פלט JSON מלא",
|
| 1025 |
-
"download_json": "⬇️ הורד JSON",
|
| 1026 |
-
"no_entities": "אין ישויות",
|
| 1027 |
-
"models_not_loaded": "המודלים לא נטענו.",
|
| 1028 |
-
"token": "טוקן",
|
| 1029 |
-
"lemma": "למה",
|
| 1030 |
-
"pos": "חלק דיבר",
|
| 1031 |
-
"prefixes": "תחיליות",
|
| 1032 |
-
"head": "ראש",
|
| 1033 |
-
"rel": "יחס",
|
| 1034 |
-
"play_all": "נגן הכל",
|
| 1035 |
-
"stop": "עצור",
|
| 1036 |
-
"playing": "מנגן משפט",
|
| 1037 |
-
"finished": "✓ סיום",
|
| 1038 |
-
"stopped": "נעצר",
|
| 1039 |
-
"download_merged_speech": "הורד קובץ דיבור ממוזג (WAV)",
|
| 1040 |
-
"download_speech_files": "הורד קבצי דיבור (WAV)",
|
| 1041 |
-
"breaking_sentences": "מפצל משפטים...",
|
| 1042 |
-
"processing_sentence": "מעבד משפט",
|
| 1043 |
-
"finalizing": "מסיים...",
|
| 1044 |
-
"done": "✓ הסתיים",
|
| 1045 |
-
"no_sentences": "לא נמצאו משפטים",
|
| 1046 |
-
"loading_models": "טוען מודלים...",
|
| 1047 |
-
"loading_dictabert": "טוען DictaBERT...",
|
| 1048 |
-
"loading_phonikud": "טוען Phonikud...",
|
| 1049 |
-
"loading_piper": "טוען Piper TTS...",
|
| 1050 |
-
},
|
| 1051 |
-
"en": {
|
| 1052 |
-
"title": "🇮🇱 Hebrew Unified NLP",
|
| 1053 |
-
"subtitle": "Async Parallel Pipeline | Cross-Platform Sentence Breaking | Map-Reduce Architecture",
|
| 1054 |
-
"settings": "⚙️ Settings",
|
| 1055 |
-
"language": "🌐 Language",
|
| 1056 |
-
"compute_mst": "Create Syntax Tree",
|
| 1057 |
-
"generate_speech": "Generate speech",
|
| 1058 |
-
"tts_params": "TTS Parameters",
|
| 1059 |
-
"en_voice": "🇬🇧 English Voice",
|
| 1060 |
-
"speed": "Speed",
|
| 1061 |
-
"noise": "Noise",
|
| 1062 |
-
"noise_w": "Noise W",
|
| 1063 |
-
"sentence_pause": "Pause between sentences (sec)",
|
| 1064 |
-
"sentence_breaker": "Sentence Breaker",
|
| 1065 |
-
"available_backends": "Available backends",
|
| 1066 |
-
"load_example": "📚 Load Example:",
|
| 1067 |
-
"enter_text": "Enter text (multiple sentences supported):",
|
| 1068 |
-
"placeholder": "Enter text...",
|
| 1069 |
-
"analyze": "🔍 Analyze",
|
| 1070 |
-
"processing": "Processing sentences in parallel...",
|
| 1071 |
-
"enter_text_warning": "Please enter some text to analyze.",
|
| 1072 |
-
"sentences": "Sentences",
|
| 1073 |
-
"processing_time": "Processing",
|
| 1074 |
-
"workers": "Workers",
|
| 1075 |
-
"breaker": "Breaker",
|
| 1076 |
-
"sentence": "Sentence",
|
| 1077 |
-
"original": "Original:",
|
| 1078 |
-
"nikud": "Nikud:",
|
| 1079 |
-
"phonemes": "Phonemes:",
|
| 1080 |
-
"dep_tree": "🌳 Dependency Tree",
|
| 1081 |
-
"all_trees": "🌳 All Trees",
|
| 1082 |
-
"sentence_json": "📄 Sentence JSON",
|
| 1083 |
-
"morphology": "🔠 Morphology",
|
| 1084 |
-
"ner": "🏷️ Named Entities",
|
| 1085 |
-
"speech": "🔊 Speech",
|
| 1086 |
-
"no_speech": "No speech - enable 'Generate speech' in settings",
|
| 1087 |
-
"duration": "Duration",
|
| 1088 |
-
"json_output": "📄 JSON",
|
| 1089 |
-
"full_json": "Full JSON Output",
|
| 1090 |
-
"download_json": "⬇️ Download JSON",
|
| 1091 |
-
"no_entities": "No named entities",
|
| 1092 |
-
"models_not_loaded": "Models not loaded.",
|
| 1093 |
-
"token": "Token",
|
| 1094 |
-
"lemma": "Lemma",
|
| 1095 |
-
"pos": "POS",
|
| 1096 |
-
"prefixes": "Prefixes",
|
| 1097 |
-
"head": "Head",
|
| 1098 |
-
"rel": "Rel",
|
| 1099 |
-
"play_all": "Play All",
|
| 1100 |
-
"stop": "Stop",
|
| 1101 |
-
"playing": "Playing sentence",
|
| 1102 |
-
"finished": "✓ Finished",
|
| 1103 |
-
"stopped": "Stopped",
|
| 1104 |
-
"download_merged_speech": "Download merged speech file (WAV)",
|
| 1105 |
-
"download_speech_files": "Download speech Files (WAV)",
|
| 1106 |
-
"breaking_sentences": "Breaking sentences...",
|
| 1107 |
-
"processing_sentence": "Processing sentence",
|
| 1108 |
-
"finalizing": "Finalizing...",
|
| 1109 |
-
"done": "✓ Done",
|
| 1110 |
-
"no_sentences": "No sentences found",
|
| 1111 |
-
"loading_models": "Loading models...",
|
| 1112 |
-
"loading_dictabert": "Loading DictaBERT...",
|
| 1113 |
-
"loading_phonikud": "Loading Phonikud...",
|
| 1114 |
-
"loading_piper": "Loading Piper TTS...",
|
| 1115 |
-
}
|
| 1116 |
-
}
|
| 1117 |
-
|
| 1118 |
def t(key: str) -> str:
|
| 1119 |
-
"""Get translated string for current language"""
|
| 1120 |
lang = st.session_state.get('ui_lang', 'he')
|
| 1121 |
-
return
|
| 1122 |
|
| 1123 |
def is_rtl() -> bool:
|
| 1124 |
"""Check if current language is RTL"""
|
|
@@ -1130,7 +1073,7 @@ def is_rtl() -> bool:
|
|
| 1130 |
# ============================================================================
|
| 1131 |
|
| 1132 |
st.set_page_config(
|
| 1133 |
-
page_title="
|
| 1134 |
page_icon="🇮🇱",
|
| 1135 |
layout="wide"
|
| 1136 |
)
|
|
|
|
| 1 |
"""
|
| 2 |
+
Bilingual NLP + TTS - Nikud · Syntax · NER · Speech
|
| 3 |
+
====================================================
|
| 4 |
|
| 5 |
Architecture:
|
| 6 |
Text → pysbd Sentence Breaker → async.gather([process(s) for s in sentences]) → JSON Array
|
|
|
|
| 287 |
# Construct state
|
| 288 |
"Construct": {"en": "Construct", "he": "סמיכות"},
|
| 289 |
"Free": {"en": "Free", "he": "נפרד"}
|
| 290 |
+
},
|
| 291 |
+
"ui": {
|
| 292 |
+
# App header
|
| 293 |
+
"title": {"en": "🇮🇱🇬🇧 Bilingual NLP + TTS", "he": "🇮🇱🇬🇧 ניתוח שפה דו-לשוני + דיבור"},
|
| 294 |
+
"page_title": {"en": "🇮🇱🇬🇧 Bilingual NLP + TTS", "he": "🇮🇱🇬🇧 ניתוח שפה דו-לשוני + דיבור"},
|
| 295 |
+
"subtitle": {"en": "Nikud · Syntax · NER · Speech", "he": "ניקוד · תחביר · ישויות · דיבור"},
|
| 296 |
+
# Settings
|
| 297 |
+
"settings": {"en": "⚙️ Settings", "he": "⚙️ הגדרות"},
|
| 298 |
+
"language": {"en": "🌐 Language", "he": "🌐 שפה"},
|
| 299 |
+
"compute_mst": {"en": "Create Syntax Tree", "he": "צור עץ תחבירי"},
|
| 300 |
+
"generate_speech": {"en": "Generate speech", "he": "צור דיבור"},
|
| 301 |
+
"tts_params": {"en": "TTS Parameters", "he": "הגדרות דיבור"},
|
| 302 |
+
"en_voice": {"en": "🇬🇧 English Voice", "he": "🇬🇧 קול אנגלית"},
|
| 303 |
+
"speed": {"en": "Speed", "he": "מהירות"},
|
| 304 |
+
"noise": {"en": "Noise", "he": "רעש"},
|
| 305 |
+
"noise_w": {"en": "Noise W", "he": "רעש W"},
|
| 306 |
+
"sentence_pause": {"en": "Pause between sentences (sec)", "he": "השהיה בין משפטים (שניות)"},
|
| 307 |
+
"sentence_breaker": {"en": "Sentence Breaker", "he": "מפצל משפטים"},
|
| 308 |
+
"available_backends": {"en": "Available backends", "he": "מנועים זמינים"},
|
| 309 |
+
# Input
|
| 310 |
+
"load_example": {"en": "📚 Load Example:", "he": "📚 טען דוגמה:"},
|
| 311 |
+
"enter_text": {"en": "Enter text (multiple sentences supported):", "he": "הזן טקסט (תמיכה במספר משפטים):"},
|
| 312 |
+
"placeholder": {"en": "Enter text...", "he": "הקלד טקסט..."},
|
| 313 |
+
"analyze": {"en": "🔍 Analyze", "he": "🔍 נתח"},
|
| 314 |
+
"enter_text_warning": {"en": "Please enter some text to analyze.", "he": "אנא הזן טקסט לניתוח."},
|
| 315 |
+
# Processing
|
| 316 |
+
"processing": {"en": "Processing sentences in parallel...", "he": "מעבד משפטים במקביל..."},
|
| 317 |
+
"breaking_sentences": {"en": "Breaking sentences...", "he": "מפצל משפטים..."},
|
| 318 |
+
"processing_sentence": {"en": "Processing sentence", "he": "מעבד משפט"},
|
| 319 |
+
"finalizing": {"en": "Finalizing...", "he": "מסיים..."},
|
| 320 |
+
"done": {"en": "✓ Done", "he": "✓ הסתיים"},
|
| 321 |
+
"no_sentences": {"en": "No sentences found", "he": "לא נמצאו משפטים"},
|
| 322 |
+
# Metrics
|
| 323 |
+
"sentences": {"en": "Sentences", "he": "משפטים"},
|
| 324 |
+
"processing_time": {"en": "Processing", "he": "זמן עיבוד"},
|
| 325 |
+
"workers": {"en": "Workers", "he": "עובדים"},
|
| 326 |
+
"breaker": {"en": "Breaker", "he": "מפצל"},
|
| 327 |
+
# Results
|
| 328 |
+
"sentence": {"en": "Sentence", "he": "משפט"},
|
| 329 |
+
"original": {"en": "Original:", "he": "מקור:"},
|
| 330 |
+
"nikud": {"en": "Nikud:", "he": "ניקוד:"},
|
| 331 |
+
"phonemes": {"en": "Phonemes:", "he": "פונמות:"},
|
| 332 |
+
"dep_tree": {"en": "🌳 Dependency Tree", "he": "🌳 עץ תלויות"},
|
| 333 |
+
"all_trees": {"en": "🌳 All Trees", "he": "🌳 כל העצים"},
|
| 334 |
+
"sentence_json": {"en": "📄 Sentence JSON", "he": "📄 JSON משפט"},
|
| 335 |
+
"morphology": {"en": "🔠 Morphology", "he": "🔠 מורפולוגיה"},
|
| 336 |
+
"ner": {"en": "🏷️ Named Entities", "he": "🏷️ ישויות"},
|
| 337 |
+
"no_entities": {"en": "No named entities", "he": "אין ישויות"},
|
| 338 |
+
# Speech
|
| 339 |
+
"speech": {"en": "🔊 Speech", "he": "🔊 דיבור"},
|
| 340 |
+
"no_speech": {"en": "No speech - enable 'Generate speech' in settings", "he": "אין דיבור - הפעל 'צור דיבור' בהגדרות"},
|
| 341 |
+
"duration": {"en": "Duration", "he": "משך"},
|
| 342 |
+
"play_all": {"en": "Play All", "he": "נגן הכל"},
|
| 343 |
+
"stop": {"en": "Stop", "he": "עצור"},
|
| 344 |
+
"playing": {"en": "Playing sentence", "he": "מנגן משפט"},
|
| 345 |
+
"finished": {"en": "✓ Finished", "he": "✓ סיום"},
|
| 346 |
+
"stopped": {"en": "Stopped", "he": "נעצר"},
|
| 347 |
+
"download_merged_speech": {"en": "Download merged speech file (WAV)", "he": "הורד קובץ דיבור ממוזג (WAV)"},
|
| 348 |
+
"download_speech_files": {"en": "Download speech Files (WAV)", "he": "הורד קבצי דיבור (WAV)"},
|
| 349 |
+
# JSON
|
| 350 |
+
"json_output": {"en": "📄 JSON", "he": "📄 JSON"},
|
| 351 |
+
"full_json": {"en": "Full JSON Output", "he": "פלט JSON מלא"},
|
| 352 |
+
"download_json": {"en": "⬇️ Download JSON", "he": "⬇️ הורד JSON"},
|
| 353 |
+
# Table columns
|
| 354 |
+
"token": {"en": "Token", "he": "טוקן"},
|
| 355 |
+
"lemma": {"en": "Lemma", "he": "למה"},
|
| 356 |
+
"pos": {"en": "POS", "he": "חלק דיבר"},
|
| 357 |
+
"prefixes": {"en": "Prefixes", "he": "תחיליות"},
|
| 358 |
+
"head": {"en": "Head", "he": "ראש"},
|
| 359 |
+
"rel": {"en": "Rel", "he": "יחס"},
|
| 360 |
+
# Loading
|
| 361 |
+
"models_not_loaded": {"en": "Models not loaded.", "he": "המודלים לא נטענו."},
|
| 362 |
+
"loading_models": {"en": "Loading models...", "he": "טוען מודלים..."},
|
| 363 |
+
"loading_dictabert": {"en": "Loading DictaBERT...", "he": "טוען DictaBERT..."},
|
| 364 |
+
"loading_phonikud": {"en": "Loading Phonikud...", "he": "טוען Phonikud..."},
|
| 365 |
+
"loading_piper": {"en": "Loading Piper TTS...", "he": "טוען Piper TTS..."},
|
| 366 |
}
|
| 367 |
}
|
| 368 |
|
|
|
|
| 1058 |
# Internationalization (i18n)
|
| 1059 |
# ============================================================================
|
| 1060 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1061 |
def t(key: str) -> str:
|
| 1062 |
+
"""Get translated UI string for current language."""
|
| 1063 |
lang = st.session_state.get('ui_lang', 'he')
|
| 1064 |
+
return get_label("ui", key, lang)
|
| 1065 |
|
| 1066 |
def is_rtl() -> bool:
|
| 1067 |
"""Check if current language is RTL"""
|
|
|
|
| 1073 |
# ============================================================================
|
| 1074 |
|
| 1075 |
st.set_page_config(
|
| 1076 |
+
page_title="Bilingual NLP + TTS",
|
| 1077 |
page_icon="🇮🇱",
|
| 1078 |
layout="wide"
|
| 1079 |
)
|