WiktionaryEN

Running

App Files Files Community

cstr committed 21 days ago

Commit

3beabc6

verified ·

1 Parent(s): 85c7f72

fix null entries in sql db, formatted ui output

Browse files

Files changed (1) hide show

app.py +255 -32

app.py CHANGED Viewed

@@ -1,7 +1,7 @@
 # ============================================================================
-# ENGLISH LINGUISTICS HUB (CONSOLIDATED APP V23-EN)
 #
-# This script adapts the German Linguistics Hub for English analysis,
 # adding NLTK, Stanza, TextBlob, HanTa(EN), OEWN, and OpenBLP.
 # It maintains the exact same JSON output structure as the German app.
 #
@@ -1568,7 +1568,8 @@ def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
     spacy_pos_hint, spacy_lemma_hint = None, None
     try:
-        nlp_en = spacy_load_spacy_model("en_core_web_md")
         if nlp_en:
             doc = nlp_en(word)
             token = doc[0]
@@ -1590,15 +1591,18 @@ def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
         if spacy_pos_hint and wikt_pos == spacy_pos_hint:
             if spacy_lemma_hint and wikt_lemma == spacy_lemma_hint: return 1
             return 2
-        if wikt_lemma.lower() == word.lower(): return 3
         return 4
     wiktionary_reports.sort(key=get_priority_score)
     word_lower = word.lower()
     for wikt_report in wiktionary_reports:
         pos_key = _wiktionary_map_pos_key(wikt_report.get("pos"))
-        lemma = wikt_report.get("lemma", word)
-        pos_title = wikt_report.get("pos_title", "")
         inflections_wikt_block = {
             "base_form": lemma,
@@ -1609,7 +1613,9 @@ def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
         pattern_block = {}
         if PATTERN_EN_AVAILABLE:
             try:
                 use_word = word if "form" in pos_title.lower() else lemma
                 if pos_key == "noun": pattern_block = pattern_analyze_as_noun_en(use_word)
                 elif pos_key == "verb": pattern_block = pattern_analyze_as_verb_en(use_word)
                 elif pos_key == "adjective": pattern_block = pattern_analyze_as_adjective_en(use_word)
@@ -1625,12 +1631,14 @@ def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
             "semantics_combined": semantics_block,
             "wiktionary_metadata": {
                  "pos_title": pos_title,
-                 "etymology": wikt_report.get("etymology_text"),
-                 "pronunciation": wikt_report.get("sounds"),
             }
         }
         is_valid = False
         is_inflected_entry = any(ft in pos_title for ft in ["form", "Comparative", "Superlative"])
         if lemma.lower() == word_lower: is_valid = True
@@ -2188,6 +2196,212 @@ def analyze_word_encyclopedia(word: str, top_n_value: Optional[float] = 0, engin
     }
 # ============================================================================
 # 8. GRADIO UI CREATION (Adapted for English)
 # ============================================================================
@@ -2367,34 +2581,39 @@ def create_combined_tab():
         analyze_button = gr.Button("Run Comprehensive Analysis", variant="primary")
     status_output = gr.Markdown(value="", visible=True)
-    output = gr.JSON(label="Comprehensive Analysis (JSON)")
-    def run_analysis_with_status(text, top_n):
         try:
             status = "🔄 Analyzing..."
-            yield status, {}
             result = comprehensive_english_analysis(text, top_n)
             status = f"✅ Analysis complete! Found {len(result.get('lemma_deep_dive', {}))} lemmas."
-            yield status, result
         except Exception as e:
             error_status = f"❌ Error: {str(e)}"
-            yield error_status, {"error": str(e), "traceback": traceback.format_exc()}
     analyze_button.click(
-        fn=run_analysis_with_status,
         inputs=[text_input, top_n_number],
-        outputs=[status_output, output],
         api_name="comprehensive_analysis"
     )
     gr.Examples(
-        [["The cat sleeps on the table.", 3],
-         ["This is a houze.", 0],
-         ["I am running quickly.", 3],
-         ["The gardener is planting a tree.", 5]],
         inputs=[text_input, top_n_number],
-        outputs=[status_output, output],
-        fn=run_analysis_with_status,
         cache_examples=False
     )
@@ -2416,7 +2635,7 @@ def create_word_encyclopedia_tab():
             )
             engine_radio = gr.Radio(
-                label="Select Analysis Engine (will auto-fallback)",
                 choices=[
                     ("Wiktionary (Default)", "wiktionary"),
                     ("HanTa (EN)", "hanta"),
@@ -2430,24 +2649,28 @@ def create_word_encyclopedia_tab():
         analyze_button = gr.Button("Analyze Word", variant="primary")
-    output = gr.JSON(label="Word Encyclopedia Analysis (JSON)")
     analyze_button.click(
-        fn=lambda word, top_n, engine: analyze_word_encyclopedia(word, top_n, engine, 'en'),
         inputs=[word_input, top_n_number, engine_radio],
-        outputs=[output],
         api_name="analyze_word"
     )
     gr.Examples(
-        [["run", 3, "wiktionary"],
-         ["water", 0, "wiktionary"],
-         ["fast", 3, "hanta"],
-         ["ran", 0, "stanza"],
-         ["beautiful", 0, "nltk"]],
         inputs=[word_input, top_n_number, engine_radio],
-        outputs=[output],
-        fn=lambda word, top_n, engine: analyze_word_encyclopedia(word, top_n, engine, 'en'),
         cache_examples=False
     )

 # ============================================================================
+# ENGLISH LINGUISTICS HUB (CONSOLIDATED APP V24-EN)
 #
+# This script provides a comprehensive Linguistics Hub for English analysis,
 # adding NLTK, Stanza, TextBlob, HanTa(EN), OEWN, and OpenBLP.
 # It maintains the exact same JSON output structure as the German app.
 #
     spacy_pos_hint, spacy_lemma_hint = None, None
     try:
+        # Quick heuristic to help sort results if multiple entries exist
+        nlp_en = SPACY_MODELS.get("en")
         if nlp_en:
             doc = nlp_en(word)
             token = doc[0]
         if spacy_pos_hint and wikt_pos == spacy_pos_hint:
             if spacy_lemma_hint and wikt_lemma == spacy_lemma_hint: return 1
             return 2
+        if wikt_lemma and wikt_lemma.lower() == word.lower(): return 3
         return 4
     wiktionary_reports.sort(key=get_priority_score)
     word_lower = word.lower()
     for wikt_report in wiktionary_reports:
+        # --- FIX: Safe Extraction of DB Fields ---
         pos_key = _wiktionary_map_pos_key(wikt_report.get("pos"))
+        lemma = wikt_report.get("lemma") or word # Fallback if None
+        # FORCE STRING: Use 'or ""' to handle DB NULLs preventing "NoneType is not iterable"
+        pos_title = wikt_report.get("pos_title") or ""
         inflections_wikt_block = {
             "base_form": lemma,
         pattern_block = {}
         if PATTERN_EN_AVAILABLE:
             try:
+                # Safe check now that pos_title is definitely a string
                 use_word = word if "form" in pos_title.lower() else lemma
                 if pos_key == "noun": pattern_block = pattern_analyze_as_noun_en(use_word)
                 elif pos_key == "verb": pattern_block = pattern_analyze_as_verb_en(use_word)
                 elif pos_key == "adjective": pattern_block = pattern_analyze_as_adjective_en(use_word)
             "semantics_combined": semantics_block,
             "wiktionary_metadata": {
                  "pos_title": pos_title,
+                 "etymology": wikt_report.get("etymology_text") or "",
+                 "pronunciation": wikt_report.get("sounds") or "",
             }
         }
+        # Validation Logic
         is_valid = False
+        # Safe check now that pos_title is definitely a string
         is_inflected_entry = any(ft in pos_title for ft in ["form", "Comparative", "Superlative"])
         if lemma.lower() == word_lower: is_valid = True
     }
+# ============================================================================
+# 7.5 VISUALIZATION & HTML HELPERS (NEW)
+# ============================================================================
+# Inline stylesheet used by the HTML report formatters in this section: each
+# of them starts its output with this <style> block. The class names defined
+# here (ling-card, ling-pos, pos-*, inflection-table, sense-item, source-badge,
+# rel-chip, grammar-alert, ...) are the ones the formatters emit.
+HTML_CSS = """
+<style>
+    .ling-card { font-family: 'Source Sans Pro', sans-serif; border: 1px solid #e5e7eb; border-radius: 8px; padding: 15px; margin-bottom: 15px; background: #fff; box-shadow: 0 1px 3px rgba(0,0,0,0.1); }
+    .ling-header { display: flex; align-items: baseline; margin-bottom: 10px; border-bottom: 2px solid #f3f4f6; padding-bottom: 5px; }
+    .ling-lemma { font-size: 1.5em; font-weight: bold; color: #1f2937; margin-right: 10px; }
+    .ling-pos { font-size: 0.9em; text-transform: uppercase; font-weight: bold; padding: 2px 6px; border-radius: 4px; color: #fff; }
+    .pos-noun { background-color: #3b82f6; } /* Blue */
+    .pos-verb { background-color: #10b981; } /* Green */
+    .pos-adj { background-color: #f59e0b; }  /* Amber */
+    .pos-adv { background-color: #8b5cf6; }  /* Purple */
+    .ling-section { margin-top: 12px; }
+    .ling-subtitle { font-size: 0.95em; font-weight: bold; color: #6b7280; text-transform: uppercase; margin-bottom: 5px; letter-spacing: 0.05em; }
+    .inflection-table { width: 100%; font-size: 0.9em; border-collapse: collapse; }
+    .inflection-table td { padding: 4px 8px; border-bottom: 1px solid #f3f4f6; }
+    .inflection-label { color: #6b7280; font-weight: 600; width: 40%; }
+    .sense-item { margin-bottom: 6px; line-height: 1.4; font-size: 0.95em; }
+    .source-badge { display: inline-block; font-size: 0.7em; padding: 0 4px; border-radius: 3px; border: 1px solid #e5e7eb; margin-right: 5px; vertical-align: middle; }
+    .src-wikt { background: #fff1f2; color: #be123c; border-color: #fda4af; }
+    .src-oewn { background: #eff6ff; color: #1d4ed8; border-color: #93c5fd; }
+    .rel-chip { display: inline-block; background: #f3f4f6; padding: 2px 8px; border-radius: 12px; font-size: 0.85em; margin: 2px; border: 1px solid #e5e7eb; }
+    .rel-type { color: #6b7280; font-size: 0.8em; margin-right: 3px; font-weight: bold;}
+    .grammar-alert { padding: 10px; border-radius: 6px; margin-bottom: 20px; border-left: 4px solid; }
+    .alert-green { background: #f0fdf4; border-color: #22c55e; color: #166534; }
+    .alert-red { background: #fef2f2; border-color: #ef4444; color: #991b1b; }
+    details > summary { cursor: pointer; padding: 10px; background: #f9fafb; border-radius: 6px; font-weight: 600; margin-bottom: 5px; }
+    details[open] > summary { background: #e5e7eb; }
+</style>
+"""
+def _format_word_analysis_html(data: Dict[str, Any]) -> str:
+    """ Generates HTML for a single word analysis result. """
+    if not data or "analysis" not in data:
+        return f"{HTML_CSS}<div class='ling-card'>No analysis data available. {data.get('info', '')}</div>"
+    html = HTML_CSS
+    analysis = data["analysis"]
+    # Iterate over POS (noun, verb, etc.)
+    for pos_key, entries in analysis.items():
+        if not entries: continue
+        entry = entries[0] # Take best candidate
+        # --- Header ---
+        pos_class = f"pos-{pos_key}" if pos_key in ["noun", "verb", "adj", "adv"] else "pos-noun"
+        # Try to find a lemma from one of the sub-blocks
+        lemma = entry.get("inflections_wiktionary", {}).get("base_form") or \
+                entry.get("inflections_pattern", {}).get("base_form") or \
+                entry.get("semantics_combined", {}).get("lemma") or \
+                data.get("input_word")
+        html += f"""
+        <div class="ling-card">
+            <div class="ling-header">
+                <span class="ling-lemma">{lemma}</span>
+                <span class="ling-pos {pos_class}">{pos_key}</span>
+            </div>
+        """
+        # --- Inflections Section ---
+        html += "<div class='ling-section'><div class='ling-subtitle'>Morphology & Inflections</div>"
+        html += "<table class='inflection-table'>"
+        pat = entry.get("inflections_pattern", {})
+        wikt = entry.get("inflections_wiktionary", {})
+        # Noun Logic
+        if pos_key == 'noun':
+            if pat:
+                html += f"<tr><td class='inflection-label'>Singular</td><td>{pat.get('singular', '-')}</td></tr>"
+                html += f"<tr><td class='inflection-label'>Plural</td><td>{pat.get('plural', '-')}</td></tr>"
+                html += f"<tr><td class='inflection-label'>Context</td><td>{pat.get('article', '-')}</td></tr>"
+        # Verb Logic
+        elif pos_key == 'verb':
+            cj = pat.get('conjugation', {})
+            if cj:
+                pres = cj.get('Present', {})
+                past = cj.get('Past', {})
+                html += f"<tr><td class='inflection-label'>Infinitive</td><td>{pat.get('infinitive', lemma)}</td></tr>"
+                html += f"<tr><td class='inflection-label'>3rd Person (He/She)</td><td>{pres.get('he/she (3sg)', '-')}</td></tr>"
+                html += f"<tr><td class='inflection-label'>Past Simple</td><td>{past.get('General', '-')}</td></tr>"
+                parts = pat.get('participles', {})
+                html += f"<tr><td class='inflection-label'>Participle (Ing)</td><td>{parts.get('Present Participle (gerund)', '-')}</td></tr>"
+                html += f"<tr><td class='inflection-label'>Participle (Past)</td><td>{parts.get('Past Participle', '-')}</td></tr>"
+        # Adjective Logic
+        elif pos_key == 'adjective':
+            gr = pat.get('grading', {})
+            if gr:
+                 html += f"<tr><td class='inflection-label'>Positive</td><td>{gr.get('Positive', '-')}</td></tr>"
+                 html += f"<tr><td class='inflection-label'>Comparative</td><td>{gr.get('Comparative', '-')}</td></tr>"
+                 html += f"<tr><td class='inflection-label'>Superlative</td><td>{gr.get('Superlative', '-')}</td></tr>"
+        # Wiktionary Forms Fallback
+        if wikt and wikt.get("forms_list"):
+            forms_str = ", ".join([f['form_text'] for f in wikt['forms_list'][:5]])
+            html += f"<tr><td class='inflection-label'>Other Forms (Wikt)</td><td>{forms_str}...</td></tr>"
+        html += "</table></div>"
+        # --- Semantics Section ---
+        sem = entry.get("semantics_combined", {})
+        html += "<div class='ling-section'><div class='ling-subtitle'>Definitions & Senses</div>"
+        # Wiktionary Senses
+        wikt_senses = sem.get("wiktionary_senses", [])
+        for s in wikt_senses[:3]: # Limit to top 3 for UI
+            gloss = s.get("definition", "").replace(";", "<br>")
+            html += f"<div class='sense-item'><span class='source-badge src-wikt'>Wikt</span> {gloss}</div>"
+        # OEWN Senses
+        oewn_senses = sem.get("odenet_senses", [])
+        for s in oewn_senses[:3]:
+            defi = s.get("definition", "")
+            html += f"<div class='sense-item'><span class='source-badge src-oewn'>OEWN</span> {defi}</div>"
+        html += "</div>"
+        # --- Relations Section (ConceptNet) ---
+        rels = sem.get("conceptnet_relations", [])
+        if rels:
+            html += "<div class='ling-section'><div class='ling-subtitle'>Knowledge Graph (Top 5)</div>"
+            html += "<div>"
+            for r in rels[:5]:
+                rel_name = r.get("relation", "Related")
+                target = r.get("other_node") or r.get("surface")
+                html += f"<span class='rel-chip'><span class='rel-type'>{rel_name}:</span> {target}</span>"
+            html += "</div></div>"
+        html += "</div>" # End Card
+    return html
+def _format_comprehensive_html(data: Dict[str, Any]) -> str:
+    """ Generates HTML for the comprehensive sentence analysis. """
+    if "error" in data:
+        return f"<div style='color:red'>{data['error']}</div>"
+    html = HTML_CSS
+    # 1. Grammar Check Banner
+    gc = data.get("grammar_check", [])
+    if isinstance(gc, list) and len(gc) == 1 and gc[0].get("status") == "perfect":
+        html += "<div class='grammar-alert alert-green'><strong>✓ Grammar Check Passed:</strong> No obvious errors detected.</div>"
+    elif isinstance(gc, list) and gc:
+        html += "<div class='grammar-alert alert-red'><strong>⚠ Grammar Issues Detected:</strong><br>"
+        for err in gc:
+            msg = err.get("message", "Error")
+            bad = err.get("incorrect_text", "")
+            html += f"• {msg} (in: '<em>{bad}</em>')<br>"
+        html += "</div>"
+    # 2. Lemma Deep Dive Accordion
+    deep_dive = data.get("lemma_deep_dive", {})
+    if not deep_dive:
+        html += "<p>No deep analysis available.</p>"
+    else:
+        html += "<h3>Word-by-Word Analysis</h3>"
+        for lemma, details in deep_dive.items():
+            # Construct a fake "single word" object to reuse the formatting function
+            # We need to reshape the deep_dive structure slightly to match the expected format
+            # The deep dive has keys "inflection_analysis" and "semantic_analysis".
+            # We need to map this back to { "analysis": { "pos": [ entry... ] } }
+            # This is a bit tricky because deep_dive separates inflection from semantics
+            # while the word analyzer groups them by POS entry.
+            # We will generate a simplified view here.
+            html += f"<details><summary>{lemma}</summary>"
+            inflections = details.get("inflection_analysis", {})
+            semantics = details.get("semantic_analysis", {})
+            # We need to guess the POS keys present
+            all_keys = set([k.split('_')[0] for k in inflections.keys()])
+            reconstructed_data = {"analysis": {}}
+            for pos in all_keys:
+                entry = {
+                    "inflections_wiktionary": inflections.get(f"{pos}_wiktionary"),
+                    "inflections_pattern": inflections.get(f"{pos}_pattern"),
+                    "semantics_combined": {
+                        "lemma": lemma,
+                        "wiktionary_senses": [s for s in semantics.get(f"{pos}_senses", []) if s.get('source') == 'wiktionary'],
+                        "odenet_senses": [s for s in semantics.get(f"{pos}_senses", []) if s.get('source') == 'oewn'],
+                        "conceptnet_relations": semantics.get("conceptnet_relations", [])
+                    }
+                }
+                reconstructed_data["analysis"][pos] = [entry]
+            html += _format_word_analysis_html(reconstructed_data)
+            html += "</details>"
+    return html
 # ============================================================================
 # 8. GRADIO UI CREATION (Adapted for English)
 # ============================================================================
         analyze_button = gr.Button("Run Comprehensive Analysis", variant="primary")
     status_output = gr.Markdown(value="", visible=True)
+    # --- CHANGED: Added HTML output ---
+    html_output = gr.HTML(label="Visual Report")
+    json_output = gr.JSON(label="Raw JSON Data")
+    # --- CHANGED: Wrapper to return Status, HTML, and JSON ---
+    def run_analysis_with_status_visual(text, top_n):
+        """Generator callback that runs the comprehensive analysis with progress updates.
+
+        Every yield is a 3-tuple ``(status_markdown, html_report, raw_json)``:
+        first a placeholder with empty outputs, then either the finished
+        report or, if anything raises, an error status with a red error
+        snippet and a dict holding the message and the formatted traceback.
+        """
         try:
             status = "🔄 Analyzing..."
+            yield status, "", {} # Clear outputs
             result = comprehensive_english_analysis(text, top_n)
+            # Generate HTML
+            html = _format_comprehensive_html(result)
             status = f"✅ Analysis complete! Found {len(result.get('lemma_deep_dive', {}))} lemmas."
+            yield status, html, result
         except Exception as e:
             error_status = f"❌ Error: {str(e)}"
+            yield error_status, f"<div style='color:red'>{str(e)}</div>", {"error": str(e), "traceback": traceback.format_exc()}
     analyze_button.click(
+        fn=run_analysis_with_status_visual,
         inputs=[text_input, top_n_number],
+        outputs=[status_output, html_output, json_output],
         api_name="comprehensive_analysis"
     )
     gr.Examples(
+        [["The cat sleeps on the table.", 3]],
         inputs=[text_input, top_n_number],
+        outputs=[status_output, html_output, json_output],
+        fn=run_analysis_with_status_visual,
         cache_examples=False
     )
             )
             engine_radio = gr.Radio(
+                label="Select Analysis Engine",
                 choices=[
                     ("Wiktionary (Default)", "wiktionary"),
                     ("HanTa (EN)", "hanta"),
         analyze_button = gr.Button("Analyze Word", variant="primary")
+    # --- CHANGED: Added HTML output component ---
+    html_output = gr.HTML(label="Visual Report")
+    json_output = gr.JSON(label="Raw JSON Data")
+    # --- CHANGED: Wrapper function to return both HTML and JSON ---
+    def run_word_visual(word, top_n, engine):
+        data = analyze_word_encyclopedia(word, top_n, engine, 'en')
+        html = _format_word_analysis_html(data)
+        return html, data
     analyze_button.click(
+        fn=run_word_visual, # Use wrapper
         inputs=[word_input, top_n_number, engine_radio],
+        outputs=[html_output, json_output], # Output to both
         api_name="analyze_word"
     )
     gr.Examples(
+        [["run", 3, "wiktionary"], ["water", 0, "wiktionary"]],
         inputs=[word_input, top_n_number, engine_radio],
+        outputs=[html_output, json_output],
+        fn=run_word_visual,
         cache_examples=False
     )