sitayeb committed on
Commit
74c6f3f
·
verified ·
1 Parent(s): c891db8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +87 -86
app.py CHANGED
@@ -96,60 +96,48 @@ LEGAL_KEYWORDS = ["article","law","contract","clause","jurisdiction","court",
96
  ACADEMIC_KEYWORDS = ["abstract","methodology","hypothesis","conclusion","references","doi","journal"]
97
 
98
  # ============================================================
99
- # LOUGHRAN-MCDONALD ECONOMIC LEXICON (Extended)
100
  # ============================================================
101
  ECON_POSITIVE = [
102
- # English
103
  "growth","recovery","surplus","improvement","stability","increase",
104
  "expansion","acceleration","resilience","upturn","robust","favorable",
105
  "strengthened","progress","rebound","optimistic","confidence","boom",
106
  "prosper","thrive","advance","gain","rise","positive","upward",
107
  "exceed","outperform","strong","healthy","dynamic","sustainable",
108
- # French
109
  "croissance","reprise","amélioration","stabilité","excédent","hausse",
110
  "expansion","dynamique","favorable","progrès","rebond","solide",
111
- # Arabic
112
  "تعافي","نمو","استقرار","فائض","تحسّن","ارتفاع","توسع","إيجابي",
113
  "تقدم","قوي","ازدهار","انتعاش","تحسين","قوة",
114
  ]
115
  ECON_NEGATIVE = [
116
- # English
117
  "deficit","recession","inflation","decline","contraction","debt",
118
  "crisis","deterioration","slowdown","downturn","unemployment","pressure",
119
  "risk","vulnerability","shock","uncertainty","war","sanctions",
120
  "drought","collapse","default","volatile","instability","weak",
121
  "fragile","pessimistic","loss","shrink","fall","negative","downward",
122
  "slump","stagnation","turbulence","disruption","imbalance","burden",
123
- # French
124
  "déficit","récession","crise","ralentissement","chômage","incertitude",
125
  "guerre","effondrement","instabilité","baisse","fragilité","pression",
126
- # Arabic
127
  "عجز","تضخم","ركود","انكماش","أزمة","تدهور","بطالة","انخفاض",
128
  "ضغط","مخاطر","صدمة","عدم استقرار","هشاشة","ديون","عقوبات",
129
  ]
130
 
131
- # ECON_TRIGGER — متوازن إيجابي + سلبي + مؤشرات
132
  ECON_TRIGGER = [
133
- # سلبي
134
  "deficit","risk","crisis","recession","shock","uncertainty",
135
  "slowdown","pressure","vulnerable","weak","deteriorat","downturn",
136
  "contraction","debt","unemployment","inflation","collapse","volatile",
137
  "instability","fragile","stagnation","disruption","sanctions","drought",
138
- # إيجابي
139
  "growth","recovery","improvement","surplus","stable","expansion",
140
  "resilience","rebound","strengthened","acceleration","robust",
141
  "favorable","progress","increase","upturn","confidence","boom",
142
- # مؤشرات محايدة
143
  "gdp","forecast","outlook","trade","fiscal","monetary","exchange",
144
  "interest","budget","revenue","expenditure","policy","reform",
145
- # عربي
146
  "التضخم","الناتج","النمو","العجز","المخاطر","التوقعات","الميزانية",
147
- # فرنسي
148
  "croissance","déficit","récession","prévision","taux","politique",
149
  ]
150
 
151
  def economic_lexicon_score(text: str) -> float:
152
- """Loughran-McDonald Extended Lexicon → [-1, +1]"""
153
  text_lower = text.lower()
154
  pos = sum(1 for w in ECON_POSITIVE if w in text_lower)
155
  neg = sum(1 for w in ECON_NEGATIVE if w in text_lower)
@@ -182,11 +170,10 @@ def detect_document_type(texts: list) -> dict:
182
  }
183
 
184
  # ============================================================
185
- # ENSEMBLE: FinBERT (40%) + XLM-RoBERTa (30%) + Lexicon (30%)
186
  # ============================================================
187
  WEIGHTS = {"finbert": 0.40, "xlm": 0.30, "lexicon": 0.30}
188
 
189
- # — FinBERT (Financial Text Expert) —
190
  print("⏳ Loading FinBERT (ProsusAI)...")
191
  try:
192
  finbert_pipe = pipeline(
@@ -203,7 +190,6 @@ except Exception as e:
203
  finbert_pipe = None
204
  FINBERT_OK = False
205
 
206
- # — XLM-RoBERTa (Multilingual) —
207
  print("⏳ Loading XLM-RoBERTa...")
208
  try:
209
  xlm_pipe = pipeline(
@@ -226,7 +212,6 @@ def normalize_clf(raw):
226
  return raw if isinstance(raw, list) else [raw]
227
 
228
  def clf_finbert(text: str) -> float:
229
- """FinBERT → [-1, +1] | labels: positive / negative / neutral"""
230
  if not FINBERT_OK or finbert_pipe is None:
231
  return 0.0
232
  try:
@@ -238,13 +223,11 @@ def clf_finbert(text: str) -> float:
238
  return 0.0
239
 
240
  def clf_xlm(text: str) -> float:
241
- """XLM-RoBERTa → [-1, +1] | labels: LABEL_0/1/2 or positive/neutral/negative"""
242
  if not XLM_OK or xlm_pipe is None:
243
  return 0.0
244
  try:
245
  items = normalize_clf(xlm_pipe(text[:512]))
246
  d = {r["label"]: float(r["score"]) for r in items}
247
- # XLM labels: LABEL_0=neg, LABEL_1=neu, LABEL_2=pos
248
  pos = d.get("LABEL_2", d.get("positive", d.get("Positive", 0.0)))
249
  neg = d.get("LABEL_0", d.get("negative", d.get("Negative", 0.0)))
250
  return round(pos - neg, 4)
@@ -254,10 +237,8 @@ def clf_xlm(text: str) -> float:
254
 
255
  def sentiment_score_numeric(text: str) -> float:
256
  """
257
- Weighted Ensemble:
258
- 40% FinBERT (financial text expert)
259
- + 30% XLM-RoBERTa (multilingual: AR/FR/EN)
260
- + 30% Loughran-McDonald Lexicon (economic terms)
261
  → [-1, +1]
262
  """
263
  fb = clf_finbert(text)
@@ -271,7 +252,6 @@ def sentiment_score_numeric(text: str) -> float:
271
  )
272
 
273
  def run_sentiment(text: str):
274
- """Ensemble sentiment → (label_str, confidence)"""
275
  score = sentiment_score_numeric(text)
276
  if score > 0.05:
277
  sent = "Positive 😊"
@@ -282,7 +262,6 @@ def run_sentiment(text: str):
282
  return sent, round(min(abs(score), 1.0), 4)
283
 
284
  def run_sentiment_detailed(text: str) -> str:
285
- """Detailed breakdown of each model's contribution"""
286
  fb = clf_finbert(text)
287
  xlm = clf_xlm(text)
288
  lex = economic_lexicon_score(text)
@@ -477,7 +456,7 @@ def build_index(files):
477
  for fname, info in PER_FILE_INFO.items():
478
  n = sum(1 for m in KB_META if m["name"] == fname)
479
  yr = str(info.get("year","N/A"))
480
- yrb = f"{yr} ✅" if yr not in ["None","N/A","None"] else "N/A ⚠️"
481
  badge = " 🟢" if info["is_economic"] else ""
482
  tbl += f"| `{fname}` | {yrb} | {info['type']}{badge} | {info['confidence']:.0%} | {n} |\n"
483
 
@@ -560,31 +539,21 @@ def rag_retrieve(query, k=5, top_n=3):
560
  return []
561
 
562
  # ============================================================
563
- # SMART ECONOMIC CHUNK SAMPLER (Balanced)
564
  # ============================================================
565
  def get_economic_chunks(texts: list, max_chunks: int = 40) -> list:
566
- """
567
- Smart sampler:
568
- 1) فلتر chunks اقتصادية (ECON_TRIGGER)
569
- 2) عيّنة من بداية + وسط + نهاية التقرير
570
- 3) حد أقصى max_chunks
571
- """
572
  n = len(texts)
573
  econ = [t for t in texts if any(kw in t.lower() for kw in ECON_TRIGGER)]
574
-
575
  if len(econ) < 10:
576
  start = texts[:min(10, n)]
577
  mid = texts[n//2-5 : n//2+5] if n > 20 else []
578
  end = texts[-min(10, n):]
579
  econ = list(dict.fromkeys(start + mid + end))
580
-
581
- # عيّنة منتظمة من كل التقرير
582
  if len(econ) > max_chunks:
583
  step = max(1, len(econ) // max_chunks)
584
  sample = econ[::step][:max_chunks]
585
  else:
586
  sample = econ
587
-
588
  return sample
589
 
590
  # ============================================================
@@ -629,7 +598,6 @@ def smart_answer(question, history):
629
  rag_context = rag_context[:2000]
630
  has_good_rag = bool(results) and results[0]["sem"] >= 0.25
631
  answer_text = llm_groq(question, rag_context, history, lang)
632
-
633
  if has_good_rag:
634
  src = ", ".join(f"`{r['file']}` p.{r['page']}" for r in results)
635
  badge = f"\n\n📄 **{'المصدر' if lang=='ar' else 'Source'}:** {src}"
@@ -646,7 +614,6 @@ def smart_answer(question, history):
646
  def predict_with_rag(text):
647
  text = "" if text is None else str(text).strip()
648
  if not text: raise gr.Error("⚠️ Enter text first.")
649
-
650
  lang = detect_lang(text)
651
  qterms = [t for t in re.findall(r"\w+", text.lower()) if len(t)>2]
652
 
@@ -680,10 +647,9 @@ def predict_with_rag(text):
680
  flag = "🇸🇦" if h["lang"]=="ar" else "🇺🇸"
681
  md += f"- 🔑 **`{h['word']}`** → 📄 `{h['file']}` p.{h['page']} {flag}\n\n > {h['sentence']}\n\n"
682
 
683
- # ✅ Ensemble Sentiment Breakdown
684
- detail = run_sentiment_detailed(text)
685
  sent, conf = run_sentiment(text)
686
- md += f"---\n{detail}\n\n"
687
 
688
  md += "---\n## 📍 Exact Location\n\n"
689
  seen2 = set()
@@ -741,7 +707,7 @@ def get_worldbank_data(country_code, indicator, start_year, end_year):
741
  return pd.DataFrame()
742
 
743
  # ============================================================
744
- # YEARLY SENTIMENT INDEX (Ensemble)
745
  # ============================================================
746
  def build_doc_sentiment_index():
747
  if not KB_TEXTS or not KB_META: return None, None
@@ -828,7 +794,7 @@ def run_economic_forecast(country_code, target_var, start_year, end_year):
828
  global_mean = float(df_yearly["sentiment"].mean())
829
  merged["sentiment"] = merged["sentiment"].fillna(global_mean)
830
  has_yearly = True
831
- mode_msg = "✅ **Yearly Ensemble Sentiment** (FinBERT+XLM+Lexicon)"
832
  else:
833
  global_sent = (
834
  float(pd.to_numeric(df_files["sentiment"], errors="coerce").mean())
@@ -839,13 +805,14 @@ def run_economic_forecast(country_code, target_var, start_year, end_year):
839
  has_yearly = False
840
  mode_msg = "⚠️ **Global Sentiment** — rename files like `WEO_2020.pdf`"
841
 
842
- # ── 4) ✅ Normalize Sentiment [-1, +1] ──────────────────
 
843
  if merged["sentiment"].std() > 1e-6:
844
- scaler = MinMaxScaler(feature_range=(-1, 1))
845
  merged["sentiment"] = scaler.fit_transform(
846
  merged["sentiment"].values.reshape(-1, 1)
847
  ).flatten().round(4)
848
- print(f"Sentiment normalized: {dict(zip(merged['year'], merged['sentiment']))}")
849
 
850
  # ── 5) Train / Test ───────────────────────────────────────
851
  series = merged["value"].values.astype(float)
@@ -890,9 +857,12 @@ def run_economic_forecast(country_code, target_var, start_year, end_year):
890
  # Panel 1 — Forecast
891
  ax1 = axes[0]
892
  ax1.plot(years, series, "o-", color="#2196F3", label="Actual", lw=2, ms=5)
893
- ax1.plot(test_years, pred_arima, "s--", color="#FF5722", label="ARIMA(1,1,1)", lw=2)
894
- ax1.plot(test_years, pred_sarimax, "^-.", color="#4CAF50", label="SARIMAX+Ensemble", lw=2)
895
- ax1.axvline(x=years[split-1], color="gray", linestyle=":", alpha=0.7, label="Train│Test")
 
 
 
896
  ax1.set_title(
897
  f"📈 {target_var} — {country_code} "
898
  f"({'Yearly' if has_yearly else 'Global'} Ensemble Sentiment)",
@@ -904,15 +874,18 @@ def run_economic_forecast(country_code, target_var, start_year, end_year):
904
  # Panel 2 — Ensemble Sentiment Timeline
905
  ax2 = axes[1]
906
  s_vals = merged["sentiment"].values
907
- s_clrs = ["#4CAF50" if s>0.05 else "#FF5722" if s<-0.05 else "#FFC107" for s in s_vals]
 
908
  ax2.bar(years, s_vals, color=s_clrs, edgecolor="white", width=0.6)
909
  ax2.axhline(y=0, color="black", lw=0.8)
910
  ax2.set_title(
911
- f"📊 Ensemble Sentiment Index (FinBERT 40% + XLM 30% + Lexicon 30%)\n"
912
- f"{'per-year' if has_yearly else 'global constant'} normalized [-1,+1]",
 
913
  fontsize=10, fontweight="bold"
914
  )
915
- ax2.set_xlabel("Year"); ax2.set_ylabel("Sentiment Score (normalized)")
 
916
  ax2.grid(True, alpha=0.3, axis="y")
917
  ax2.legend(handles=[
918
  Patch(facecolor="#4CAF50", label="Optimistic (>0.05)"),
@@ -920,21 +893,25 @@ def run_economic_forecast(country_code, target_var, start_year, end_year):
920
  Patch(facecolor="#FF5722", label="Pessimistic (<-0.05)"),
921
  ], loc="upper right", fontsize=8)
922
 
923
- # Panel 3 — RMSE
924
  ax3 = axes[2]
925
  clrs = ["#FF5722" if rmse_a <= rmse_s else "#4CAF50",
926
  "#4CAF50" if rmse_s <= rmse_a else "#FF5722"]
927
  bars = ax3.bar(
928
  ["ARIMA(1,1,1)", "SARIMAX\n+Ensemble"],
929
- [rmse_a, rmse_s], color=clrs, width=0.4, edgecolor="white"
 
930
  )
931
  for bar, val in zip(bars, [rmse_a, rmse_s]):
932
  ax3.text(
933
- bar.get_x()+bar.get_width()/2, bar.get_height()+0.01,
934
- f"{val:.4f}", ha="center", va="bottom", fontweight="bold", fontsize=11
 
 
935
  )
936
  ax3.set_title("RMSE Comparison (lower = better)", fontsize=11)
937
- ax3.set_ylabel("RMSE"); ax3.grid(True, alpha=0.3, axis="y")
 
938
 
939
  plt.tight_layout(pad=3.0)
940
  img_path = "/tmp/forecast_plot.png"
@@ -945,15 +922,18 @@ def run_economic_forecast(country_code, target_var, start_year, end_year):
945
  sent_table = ""
946
  if df_files is not None and len(df_files) > 0:
947
  sent_table = "\n---\n### 📄 Ensemble Sentiment per File\n\n"
948
- sent_table += "| 📄 File | 📅 Year | 😊 Score | 📦 Chunks | Label |\n|---|---|---|---|---|\n"
 
949
  for _, row in df_files.iterrows():
950
- yrb = f"{row['year']} ✅" if str(row['year']) not in ["N/A","None"] else "N/A ⚠️"
 
951
  sent_table += (
952
  f"| `{row['file']}` | {yrb} "
953
  f"| `{row['sentiment']:+.4f}` "
954
  f"| {row['n_chunks']} | {row['label']} |\n"
955
  )
956
 
 
957
  arrow = "✅ Improved" if impr_rmse > 0 else "❌ No improvement"
958
  result_md = (
959
  f"## 📊 Forecast — Ensemble Sentiment\n\n"
@@ -966,9 +946,9 @@ def run_economic_forecast(country_code, target_var, start_year, end_year):
966
  f"| 🧪 Test | **{len(test_y)}** samples |\n\n"
967
  f"---\n### 🏆 Model Comparison\n\n"
968
  f"| Model | RMSE | MAE | MAPE |\n|---|---|---|---|\n"
969
- f"| ARIMA(1,1,1) | `{rmse_a:.4f}` | `{mae_a:.4f}` | `{mape_a:.1f}%` |\n"
970
- f"| SARIMAX+Ensemble | `{rmse_s:.4f}` | `{mae_s:.4f}` | `{mape_s:.1f}%` |\n"
971
- f"| **Improvement** | **{impr_rmse:+.1f}%** | **{impr_mae:+.1f}%** | **{impr_mape:+.1f}%** |\n\n"
972
  f"**{arrow}** by adding Ensemble Sentiment Index.\n"
973
  f"{sent_table}"
974
  )
@@ -1009,9 +989,11 @@ def get_stats():
1009
  def get_top_keywords():
1010
  if not KB_TEXTS: return "_No files indexed yet._"
1011
  all_words = re.findall(r"\b\w{4,}\b", " ".join(KB_TEXTS).lower())
1012
- stopwords = {"this","that","with","from","have","been","will","your",
1013
- "they","their","which","when","what","also","more","some",
1014
- "than","then","were","would","could","into","over","such"}
 
 
1015
  top = Counter(w for w in all_words if w not in stopwords).most_common(20)
1016
  return "### 🔑 Top Keywords\n\n" + "\n".join(f"- **{w}**: {c}" for w,c in top)
1017
 
@@ -1070,7 +1052,7 @@ with gr.Blocks(
1070
  gr.Markdown(
1071
  "**Upload PDF / DOCX / TXT / CSV**\n\n"
1072
  "> 💡 Name files like `WEO_2020.pdf` — year in filename required!\n"
1073
- "> ✅ Year must show ✅ not ⚠️ after building index."
1074
  )
1075
  files = gr.File(label="📂 Files",
1076
  file_types=[".pdf",".txt",".csv",".docx"],
@@ -1087,36 +1069,42 @@ with gr.Blocks(
1087
  load_btn.click(load_saved_index, outputs=persist_status)
1088
 
1089
  with gr.Tab("🎭 2 · Sentiment + Search"):
1090
- inp = gr.Textbox(lines=2,
1091
- placeholder="Type text… | اكتب نصاً… | Saisissez un texte…",
1092
- label="📝 Input (Ensemble: FinBERT + XLM + Lexicon)")
 
 
1093
  run_btn = gr.Button("🔍 Analyze", variant="primary")
1094
  with gr.Row():
1095
  out_sent = gr.Textbox(label="🎭 Ensemble Sentiment", interactive=False)
1096
  out_conf = gr.Number(label="📊 Score", precision=4)
1097
  out_full = gr.Markdown()
1098
- run_btn.click(predict_with_rag, inputs=inp, outputs=[out_sent, out_conf, out_full])
 
 
1099
  gr.Markdown("---")
1100
  with gr.Row():
1101
  dl_btn = gr.Button("⬇️ Download Report", variant="secondary")
1102
  rep_file = gr.File(label="report.md")
1103
- dl_btn.click(generate_report, inputs=[inp,out_sent,out_conf,out_full], outputs=rep_file)
 
 
1104
 
1105
  with gr.Tab("💬 3 · Smart Chatbot"):
1106
  chatbot = gr.Chatbot(height=430, type="messages",
1107
  placeholder="Ask anything… / اسأل أي شيء…")
1108
  msg = gr.Textbox(placeholder="Question…", label="💬")
1109
  with gr.Row():
1110
- send_btn = gr.Button("Send ➤", variant="primary")
1111
- clear_btn = gr.Button("🗑️ Clear", variant="secondary")
1112
- stats_btn = gr.Button("📊 Stats", variant="secondary")
1113
  stats_box = gr.Markdown(visible=False)
1114
  with gr.Row():
1115
  export_btn = gr.Button("💾 Export Chat", variant="secondary")
1116
  export_file = gr.File(label="chat_history.txt")
1117
  msg.submit(chat_text, inputs=[msg,chatbot], outputs=[msg,chatbot])
1118
  send_btn.click(chat_text, inputs=[msg,chatbot], outputs=[msg,chatbot])
1119
- clear_btn.click(lambda: ([],""), outputs=[chatbot,msg])
1120
  stats_btn.click(
1121
  lambda: (get_stats(), gr.update(visible=True)),
1122
  outputs=[stats_box, stats_box]
@@ -1129,7 +1117,8 @@ with gr.Blocks(
1129
  transcript = gr.Textbox(label="📝 Transcript", interactive=False)
1130
  audio_out = gr.Audio(label="🔊 Answer", type="filepath")
1131
  voice_btn.click(
1132
- chat_voice, inputs=[audio_in, chatbot],
 
1133
  outputs=[chatbot, audio_out, transcript]
1134
  )
1135
 
@@ -1150,7 +1139,7 @@ with gr.Blocks(
1150
  "| 🏦 Sentiment 1 | **FinBERT** — ProsusAI (40%) |\n"
1151
  "| 🌍 Sentiment 2 | **XLM-RoBERTa** — Cardiff NLP (30%) |\n"
1152
  "| 📖 Sentiment 3 | **Loughran-McDonald Lexicon** (30%) |\n"
1153
- "| ⚡ Ensemble | Weighted 40/30/30 + MinMaxNorm |\n"
1154
  "| 🔍 Embeddings | paraphrase-multilingual-MiniLM-L12-v2 |\n"
1155
  "| 📊 Reranker | cross-encoder/ms-marco-MiniLM-L-6-v2 |\n"
1156
  "| 🤖 LLM | Llama-3.3-70B via Groq |\n"
@@ -1175,18 +1164,27 @@ with gr.Blocks(
1175
  "2. **Build Index** → verify Year = ✅\n"
1176
  "3. Select country + variable + year range\n"
1177
  "4. **Run Forecast** → compare RMSE\n\n"
1178
- "> 🏦 FinBERT(40%) + 🌍 XLM(30%) + 📖 Lexicon(30%) → normalized [-1,+1]"
 
1179
  )
1180
  with gr.Row():
1181
- country_input = gr.Textbox(value="DZ", label="🌍 Country Code",
1182
- placeholder="DZ / US / FR / MA / TN / EG")
 
 
1183
  target_input = gr.Dropdown(
1184
- choices=["Inflation (CPI %)","GDP Growth (%)","Unemployment (%)","Exchange Rate"],
1185
- value="Inflation (CPI %)", label="🎯 Target Variable"
 
 
 
 
 
 
1186
  )
1187
  with gr.Row():
1188
  start_year = gr.Slider(minimum=2000, maximum=2020,
1189
- value=2010, step=1, label="📅 Start Year")
1190
  end_year = gr.Slider(minimum=2010, maximum=2024,
1191
  value=2023, step=1, label="📅 End Year")
1192
  forecast_btn = gr.Button("📈 Run Forecast", variant="primary", size="lg")
@@ -1199,4 +1197,7 @@ with gr.Blocks(
1199
  outputs=[forecast_result, forecast_plot]
1200
  )
1201
 
 
 
 
1202
  app.launch(server_name="0.0.0.0", server_port=7860, show_api=False)
 
96
  ACADEMIC_KEYWORDS = ["abstract","methodology","hypothesis","conclusion","references","doi","journal"]
97
 
98
# ============================================================
# LOUGHRAN-MCDONALD EXTENDED LEXICON
# ============================================================
# Positive economic terms (English / French / Arabic).
# economic_lexicon_score() counts one hit per list entry using a plain
# substring test, so every term must appear exactly once — duplicated
# entries ("expansion", "favorable" were listed in both the English and
# French sections) double-count a single mention.
ECON_POSITIVE = [
    # English
    "growth", "recovery", "surplus", "improvement", "stability", "increase",
    "expansion", "acceleration", "resilience", "upturn", "robust", "favorable",
    "strengthened", "progress", "rebound", "optimistic", "confidence", "boom",
    "prosper", "thrive", "advance", "gain", "rise", "positive", "upward",
    "exceed", "outperform", "strong", "healthy", "dynamic", "sustainable",
    # French ("expansion" and "favorable" are already listed above)
    "croissance", "reprise", "amélioration", "stabilité", "excédent", "hausse",
    "dynamique", "progrès", "rebond", "solide",
    # Arabic
    "تعافي", "نمو", "استقرار", "فائض", "تحسّن", "ارتفاع", "توسع", "إيجابي",
    "تقدم", "قوي", "ازدهار", "انتعاش", "تحسين", "قوة",
]
112
# Negative economic vocabulary, grouped per language and concatenated
# into the single flat list iterated by the lexicon scorer.
_ECON_NEGATIVE_EN = [
    "deficit", "recession", "inflation", "decline", "contraction", "debt",
    "crisis", "deterioration", "slowdown", "downturn", "unemployment", "pressure",
    "risk", "vulnerability", "shock", "uncertainty", "war", "sanctions",
    "drought", "collapse", "default", "volatile", "instability", "weak",
    "fragile", "pessimistic", "loss", "shrink", "fall", "negative", "downward",
    "slump", "stagnation", "turbulence", "disruption", "imbalance", "burden",
]
_ECON_NEGATIVE_FR = [
    "déficit", "récession", "crise", "ralentissement", "chômage", "incertitude",
    "guerre", "effondrement", "instabilité", "baisse", "fragilité", "pression",
]
_ECON_NEGATIVE_AR = [
    "عجز", "تضخم", "ركود", "انكماش", "أزمة", "تدهور", "بطالة", "انخفاض",
    "ضغط", "مخاطر", "صدمة", "عدم استقرار", "هشاشة", "ديون", "عقوبات",
]
ECON_NEGATIVE = _ECON_NEGATIVE_EN + _ECON_NEGATIVE_FR + _ECON_NEGATIVE_AR
124
 
125
# ECON_TRIGGER — a balanced mix of negative, positive and neutral
# indicator keywords, used to pre-filter economically relevant chunks.
_TRIGGER_NEGATIVE = [
    # "deteriorat" is a deliberate stem: it matches deteriorate /
    # deteriorating / deterioration in a substring test.
    "deficit", "risk", "crisis", "recession", "shock", "uncertainty",
    "slowdown", "pressure", "vulnerable", "weak", "deteriorat", "downturn",
    "contraction", "debt", "unemployment", "inflation", "collapse", "volatile",
    "instability", "fragile", "stagnation", "disruption", "sanctions", "drought",
]
_TRIGGER_POSITIVE = [
    "growth", "recovery", "improvement", "surplus", "stable", "expansion",
    "resilience", "rebound", "strengthened", "acceleration", "robust",
    "favorable", "progress", "increase", "upturn", "confidence", "boom",
]
_TRIGGER_INDICATORS = [
    # Neutral macro indicators (English)
    "gdp", "forecast", "outlook", "trade", "fiscal", "monetary", "exchange",
    "interest", "budget", "revenue", "expenditure", "policy", "reform",
    # Arabic
    "التضخم", "الناتج", "النمو", "العجز", "المخاطر", "التوقعات", "الميزانية",
    # French
    "croissance", "déficit", "récession", "prévision", "taux", "politique",
]
ECON_TRIGGER = _TRIGGER_NEGATIVE + _TRIGGER_POSITIVE + _TRIGGER_INDICATORS
139
 
140
  def economic_lexicon_score(text: str) -> float:
 
141
  text_lower = text.lower()
142
  pos = sum(1 for w in ECON_POSITIVE if w in text_lower)
143
  neg = sum(1 for w in ECON_NEGATIVE if w in text_lower)
 
170
  }
171
 
172
  # ============================================================
173
+ # ENSEMBLE: FinBERT (40%) + XLM-RoBERTa (30%) + Lexicon (30%)
174
  # ============================================================
175
  WEIGHTS = {"finbert": 0.40, "xlm": 0.30, "lexicon": 0.30}
176
 
 
177
  print("⏳ Loading FinBERT (ProsusAI)...")
178
  try:
179
  finbert_pipe = pipeline(
 
190
  finbert_pipe = None
191
  FINBERT_OK = False
192
 
 
193
  print("⏳ Loading XLM-RoBERTa...")
194
  try:
195
  xlm_pipe = pipeline(
 
212
  return raw if isinstance(raw, list) else [raw]
213
 
214
  def clf_finbert(text: str) -> float:
 
215
  if not FINBERT_OK or finbert_pipe is None:
216
  return 0.0
217
  try:
 
223
  return 0.0
224
 
225
  def clf_xlm(text: str) -> float:
 
226
  if not XLM_OK or xlm_pipe is None:
227
  return 0.0
228
  try:
229
  items = normalize_clf(xlm_pipe(text[:512]))
230
  d = {r["label"]: float(r["score"]) for r in items}
 
231
  pos = d.get("LABEL_2", d.get("positive", d.get("Positive", 0.0)))
232
  neg = d.get("LABEL_0", d.get("negative", d.get("Negative", 0.0)))
233
  return round(pos - neg, 4)
 
237
 
238
  def sentiment_score_numeric(text: str) -> float:
239
  """
240
+ Weighted Ensemble:
241
+ 40% FinBERT + 30% XLM-RoBERTa + 30% Loughran-McDonald
 
 
242
  → [-1, +1]
243
  """
244
  fb = clf_finbert(text)
 
252
  )
253
 
254
  def run_sentiment(text: str):
 
255
  score = sentiment_score_numeric(text)
256
  if score > 0.05:
257
  sent = "Positive 😊"
 
262
  return sent, round(min(abs(score), 1.0), 4)
263
 
264
  def run_sentiment_detailed(text: str) -> str:
 
265
  fb = clf_finbert(text)
266
  xlm = clf_xlm(text)
267
  lex = economic_lexicon_score(text)
 
456
  for fname, info in PER_FILE_INFO.items():
457
  n = sum(1 for m in KB_META if m["name"] == fname)
458
  yr = str(info.get("year","N/A"))
459
+ yrb = f"{yr} ✅" if yr not in ["None","N/A"] else "N/A ⚠️"
460
  badge = " 🟢" if info["is_economic"] else ""
461
  tbl += f"| `{fname}` | {yrb} | {info['type']}{badge} | {info['confidence']:.0%} | {n} |\n"
462
 
 
539
  return []
540
 
541
# ============================================================
# SMART ECONOMIC CHUNK SAMPLER
# ============================================================
def get_economic_chunks(texts: list, max_chunks: int = 40) -> list:
    """Select a bounded, economics-focused sample of text chunks.

    Keeps every chunk containing at least one ``ECON_TRIGGER`` keyword
    (case-insensitive substring match). If fewer than 10 chunks match,
    falls back to sampling the document's head, middle and tail instead.
    The result is then thinned by uniform striding to at most
    ``max_chunks`` entries; original ordering is preserved throughout.
    """
    total = len(texts)
    candidates = [
        chunk for chunk in texts
        if any(trigger in chunk.lower() for trigger in ECON_TRIGGER)
    ]
    if len(candidates) < 10:
        # Too few keyword hits — take a start/middle/end sample of the
        # report, de-duplicating while keeping first-seen order.
        head = texts[:min(10, total)]
        middle = texts[total // 2 - 5: total // 2 + 5] if total > 20 else []
        tail = texts[-min(10, total):]
        candidates = list(dict.fromkeys(head + middle + tail))
    if len(candidates) <= max_chunks:
        return candidates
    stride = max(1, len(candidates) // max_chunks)
    return candidates[::stride][:max_chunks]
558
 
559
  # ============================================================
 
598
  rag_context = rag_context[:2000]
599
  has_good_rag = bool(results) and results[0]["sem"] >= 0.25
600
  answer_text = llm_groq(question, rag_context, history, lang)
 
601
  if has_good_rag:
602
  src = ", ".join(f"`{r['file']}` p.{r['page']}" for r in results)
603
  badge = f"\n\n📄 **{'المصدر' if lang=='ar' else 'Source'}:** {src}"
 
614
  def predict_with_rag(text):
615
  text = "" if text is None else str(text).strip()
616
  if not text: raise gr.Error("⚠️ Enter text first.")
 
617
  lang = detect_lang(text)
618
  qterms = [t for t in re.findall(r"\w+", text.lower()) if len(t)>2]
619
 
 
647
  flag = "🇸🇦" if h["lang"]=="ar" else "🇺🇸"
648
  md += f"- 🔑 **`{h['word']}`** → 📄 `{h['file']}` p.{h['page']} {flag}\n\n > {h['sentence']}\n\n"
649
 
650
+ detail = run_sentiment_detailed(text)
 
651
  sent, conf = run_sentiment(text)
652
+ md += f"---\n{detail}\n\n"
653
 
654
  md += "---\n## 📍 Exact Location\n\n"
655
  seen2 = set()
 
707
  return pd.DataFrame()
708
 
709
  # ============================================================
710
+ # YEARLY SENTIMENT INDEX
711
  # ============================================================
712
  def build_doc_sentiment_index():
713
  if not KB_TEXTS or not KB_META: return None, None
 
794
  global_mean = float(df_yearly["sentiment"].mean())
795
  merged["sentiment"] = merged["sentiment"].fillna(global_mean)
796
  has_yearly = True
797
+ mode_msg = "✅ **Yearly Ensemble Sentiment** (FinBERT 40%+XLM 30%+Lexicon 30%)"
798
  else:
799
  global_sent = (
800
  float(pd.to_numeric(df_files["sentiment"], errors="coerce").mean())
 
805
  has_yearly = False
806
  mode_msg = "⚠️ **Global Sentiment** — rename files like `WEO_2020.pdf`"
807
 
808
+ # ── 4) ✅ FIXED Normalization feature_range=(-0.3, 0.3) ──
809
+ # يُلطّف تأثير Sentiment ويمنع over-prediction في Panel 1
810
  if merged["sentiment"].std() > 1e-6:
811
+ scaler = MinMaxScaler(feature_range=(-0.3, 0.3))
812
  merged["sentiment"] = scaler.fit_transform(
813
  merged["sentiment"].values.reshape(-1, 1)
814
  ).flatten().round(4)
815
+ print(f"Sentiment normalized [-0.3,+0.3]: {dict(zip(merged['year'], merged['sentiment']))}")
816
 
817
  # ── 5) Train / Test ───────────────────────────────────────
818
  series = merged["value"].values.astype(float)
 
857
  # Panel 1 — Forecast
858
  ax1 = axes[0]
859
  ax1.plot(years, series, "o-", color="#2196F3", label="Actual", lw=2, ms=5)
860
+ ax1.plot(test_years, pred_arima, "s--", color="#FF5722",
861
+ label="ARIMA(1,1,1)", lw=2)
862
+ ax1.plot(test_years, pred_sarimax, "^-.", color="#4CAF50",
863
+ label="SARIMAX+Ensemble", lw=2)
864
+ ax1.axvline(x=years[split-1], color="gray", linestyle=":",
865
+ alpha=0.7, label="Train│Test")
866
  ax1.set_title(
867
  f"📈 {target_var} — {country_code} "
868
  f"({'Yearly' if has_yearly else 'Global'} Ensemble Sentiment)",
 
874
  # Panel 2 — Ensemble Sentiment Timeline
875
  ax2 = axes[1]
876
  s_vals = merged["sentiment"].values
877
+ s_clrs = ["#4CAF50" if s>0.05 else "#FF5722" if s<-0.05 else "#FFC107"
878
+ for s in s_vals]
879
  ax2.bar(years, s_vals, color=s_clrs, edgecolor="white", width=0.6)
880
  ax2.axhline(y=0, color="black", lw=0.8)
881
  ax2.set_title(
882
+ f"📊 Ensemble Sentiment Index "
883
+ f"(FinBERT 40% + XLM 30% + Lexicon 30%)\n"
884
+ f"{'per-year' if has_yearly else 'global'} — normalized [-0.3, +0.3]",
885
  fontsize=10, fontweight="bold"
886
  )
887
+ ax2.set_xlabel("Year")
888
+ ax2.set_ylabel("Sentiment Score (normalized)")
889
  ax2.grid(True, alpha=0.3, axis="y")
890
  ax2.legend(handles=[
891
  Patch(facecolor="#4CAF50", label="Optimistic (>0.05)"),
 
893
  Patch(facecolor="#FF5722", label="Pessimistic (<-0.05)"),
894
  ], loc="upper right", fontsize=8)
895
 
896
+ # Panel 3 — RMSE Comparison
897
  ax3 = axes[2]
898
  clrs = ["#FF5722" if rmse_a <= rmse_s else "#4CAF50",
899
  "#4CAF50" if rmse_s <= rmse_a else "#FF5722"]
900
  bars = ax3.bar(
901
  ["ARIMA(1,1,1)", "SARIMAX\n+Ensemble"],
902
+ [rmse_a, rmse_s],
903
+ color=clrs, width=0.4, edgecolor="white"
904
  )
905
  for bar, val in zip(bars, [rmse_a, rmse_s]):
906
  ax3.text(
907
+ bar.get_x()+bar.get_width()/2,
908
+ bar.get_height()+0.01,
909
+ f"{val:.4f}",
910
+ ha="center", va="bottom", fontweight="bold", fontsize=11
911
  )
912
  ax3.set_title("RMSE Comparison (lower = better)", fontsize=11)
913
+ ax3.set_ylabel("RMSE")
914
+ ax3.grid(True, alpha=0.3, axis="y")
915
 
916
  plt.tight_layout(pad=3.0)
917
  img_path = "/tmp/forecast_plot.png"
 
922
  sent_table = ""
923
  if df_files is not None and len(df_files) > 0:
924
  sent_table = "\n---\n### 📄 Ensemble Sentiment per File\n\n"
925
+ sent_table += "| 📄 File | 📅 Year | 😊 Score | 📦 Chunks | Label |\n"
926
+ sent_table += "|---|---|---|---|---|\n"
927
  for _, row in df_files.iterrows():
928
+ yrb = (f"{row['year']} ✅"
929
+ if str(row['year']) not in ["N/A","None"] else "N/A ⚠️")
930
  sent_table += (
931
  f"| `{row['file']}` | {yrb} "
932
  f"| `{row['sentiment']:+.4f}` "
933
  f"| {row['n_chunks']} | {row['label']} |\n"
934
  )
935
 
936
+ # ── 10) Result Text ────────────────────────────────────────
937
  arrow = "✅ Improved" if impr_rmse > 0 else "❌ No improvement"
938
  result_md = (
939
  f"## 📊 Forecast — Ensemble Sentiment\n\n"
 
946
  f"| 🧪 Test | **{len(test_y)}** samples |\n\n"
947
  f"---\n### 🏆 Model Comparison\n\n"
948
  f"| Model | RMSE | MAE | MAPE |\n|---|---|---|---|\n"
949
+ f"| ARIMA(1,1,1) | `{rmse_a:.4f}` | `{mae_a:.4f}` | `{mape_a:.1f}%` |\n"
950
+ f"| SARIMAX+Ensemble | `{rmse_s:.4f}` | `{mae_s:.4f}` | `{mape_s:.1f}%` |\n"
951
+ f"| **Improvement** | **{impr_rmse:+.1f}%** | **{impr_mae:+.1f}%** | **{impr_mape:+.1f}%** |\n\n"
952
  f"**{arrow}** by adding Ensemble Sentiment Index.\n"
953
  f"{sent_table}"
954
  )
 
989
def get_top_keywords():
    """Render the 20 most frequent words across all indexed chunks.

    Considers lowercase tokens of 4+ word characters, drops a small
    English stopword set, and returns a Markdown bullet list (or a
    placeholder string when nothing has been indexed yet).
    """
    if not KB_TEXTS:
        return "_No files indexed yet._"
    ignored = frozenset({
        "this", "that", "with", "from", "have", "been", "will", "your",
        "they", "their", "which", "when", "what", "also", "more", "some",
        "than", "then", "were", "would", "could", "into", "over", "such",
    })
    tokens = re.findall(r"\b\w{4,}\b", " ".join(KB_TEXTS).lower())
    ranking = Counter(t for t in tokens if t not in ignored).most_common(20)
    bullets = "\n".join(f"- **{word}**: {freq}" for word, freq in ranking)
    return "### 🔑 Top Keywords\n\n" + bullets
999
 
 
1052
  gr.Markdown(
1053
  "**Upload PDF / DOCX / TXT / CSV**\n\n"
1054
  "> 💡 Name files like `WEO_2020.pdf` — year in filename required!\n"
1055
+ "> ✅ Year must show ✅ not ⚠️"
1056
  )
1057
  files = gr.File(label="📂 Files",
1058
  file_types=[".pdf",".txt",".csv",".docx"],
 
1069
  load_btn.click(load_saved_index, outputs=persist_status)
1070
 
1071
  with gr.Tab("🎭 2 · Sentiment + Search"):
1072
+ inp = gr.Textbox(
1073
+ lines=2,
1074
+ placeholder="Type text… | اكتب نصاً… | Saisissez un texte…",
1075
+ label="📝 Input (Ensemble: FinBERT 40% + XLM 30% + Lexicon 30%)"
1076
+ )
1077
  run_btn = gr.Button("🔍 Analyze", variant="primary")
1078
  with gr.Row():
1079
  out_sent = gr.Textbox(label="🎭 Ensemble Sentiment", interactive=False)
1080
  out_conf = gr.Number(label="📊 Score", precision=4)
1081
  out_full = gr.Markdown()
1082
+ run_btn.click(predict_with_rag,
1083
+ inputs=inp,
1084
+ outputs=[out_sent, out_conf, out_full])
1085
  gr.Markdown("---")
1086
  with gr.Row():
1087
  dl_btn = gr.Button("⬇️ Download Report", variant="secondary")
1088
  rep_file = gr.File(label="report.md")
1089
+ dl_btn.click(generate_report,
1090
+ inputs=[inp, out_sent, out_conf, out_full],
1091
+ outputs=rep_file)
1092
 
1093
  with gr.Tab("💬 3 · Smart Chatbot"):
1094
  chatbot = gr.Chatbot(height=430, type="messages",
1095
  placeholder="Ask anything… / اسأل أي شيء…")
1096
  msg = gr.Textbox(placeholder="Question…", label="💬")
1097
  with gr.Row():
1098
+ send_btn = gr.Button("Send ➤", variant="primary")
1099
+ clear_btn = gr.Button("🗑️ Clear", variant="secondary")
1100
+ stats_btn = gr.Button("📊 Stats", variant="secondary")
1101
  stats_box = gr.Markdown(visible=False)
1102
  with gr.Row():
1103
  export_btn = gr.Button("💾 Export Chat", variant="secondary")
1104
  export_file = gr.File(label="chat_history.txt")
1105
  msg.submit(chat_text, inputs=[msg,chatbot], outputs=[msg,chatbot])
1106
  send_btn.click(chat_text, inputs=[msg,chatbot], outputs=[msg,chatbot])
1107
+ clear_btn.click(lambda: ([], ""), outputs=[chatbot, msg])
1108
  stats_btn.click(
1109
  lambda: (get_stats(), gr.update(visible=True)),
1110
  outputs=[stats_box, stats_box]
 
1117
  transcript = gr.Textbox(label="📝 Transcript", interactive=False)
1118
  audio_out = gr.Audio(label="🔊 Answer", type="filepath")
1119
  voice_btn.click(
1120
+ chat_voice,
1121
+ inputs=[audio_in, chatbot],
1122
  outputs=[chatbot, audio_out, transcript]
1123
  )
1124
 
 
1139
  "| 🏦 Sentiment 1 | **FinBERT** — ProsusAI (40%) |\n"
1140
  "| 🌍 Sentiment 2 | **XLM-RoBERTa** — Cardiff NLP (30%) |\n"
1141
  "| 📖 Sentiment 3 | **Loughran-McDonald Lexicon** (30%) |\n"
1142
+ "| ⚡ Ensemble | Weighted 40/30/30 + MinMax[-0.3,+0.3] |\n"
1143
  "| 🔍 Embeddings | paraphrase-multilingual-MiniLM-L12-v2 |\n"
1144
  "| 📊 Reranker | cross-encoder/ms-marco-MiniLM-L-6-v2 |\n"
1145
  "| 🤖 LLM | Llama-3.3-70B via Groq |\n"
 
1164
  "2. **Build Index** → verify Year = ✅\n"
1165
  "3. Select country + variable + year range\n"
1166
  "4. **Run Forecast** → compare RMSE\n\n"
1167
+ "> 🏦 FinBERT(40%) + 🌍 XLM(30%) + 📖 Lexicon(30%)\n"
1168
+ "> Normalized to **[-0.3, +0.3]** for smooth forecasting"
1169
  )
1170
  with gr.Row():
1171
+ country_input = gr.Textbox(
1172
+ value="DZ", label="🌍 Country Code",
1173
+ placeholder="DZ / US / FR / MA / TN / EG"
1174
+ )
1175
  target_input = gr.Dropdown(
1176
+ choices=[
1177
+ "Inflation (CPI %)",
1178
+ "GDP Growth (%)",
1179
+ "Unemployment (%)",
1180
+ "Exchange Rate"
1181
+ ],
1182
+ value="Inflation (CPI %)",
1183
+ label="🎯 Target Variable"
1184
  )
1185
  with gr.Row():
1186
  start_year = gr.Slider(minimum=2000, maximum=2020,
1187
+ value=2000, step=1, label="📅 Start Year")
1188
  end_year = gr.Slider(minimum=2010, maximum=2024,
1189
  value=2023, step=1, label="📅 End Year")
1190
  forecast_btn = gr.Button("📈 Run Forecast", variant="primary", size="lg")
 
1197
  outputs=[forecast_result, forecast_plot]
1198
  )
1199
 
1200
+ # ============================================================
1201
+ # LAUNCH
1202
+ # ============================================================
1203
  app.launch(server_name="0.0.0.0", server_port=7860, show_api=False)