sitayeb committed on
Commit
74c6f3f
·
verified ·
1 Parent(s): c891db8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +87 -86
app.py CHANGED
@@ -96,60 +96,48 @@ LEGAL_KEYWORDS = ["article","law","contract","clause","jurisdiction","court",
96
  ACADEMIC_KEYWORDS = ["abstract","methodology","hypothesis","conclusion","references","doi","journal"]
97
 
98
  # ============================================================
99
- # LOUGHRAN-MCDONALD ECONOMIC LEXICON (Extended)
100
  # ============================================================
101
  ECON_POSITIVE = [
102
- # English
103
  "growth","recovery","surplus","improvement","stability","increase",
104
  "expansion","acceleration","resilience","upturn","robust","favorable",
105
  "strengthened","progress","rebound","optimistic","confidence","boom",
106
  "prosper","thrive","advance","gain","rise","positive","upward",
107
  "exceed","outperform","strong","healthy","dynamic","sustainable",
108
- # French
109
  "croissance","reprise","amélioration","stabilité","excédent","hausse",
110
  "expansion","dynamique","favorable","progrès","rebond","solide",
111
- # Arabic
112
  "تعافي","نمو","استقرار","فائض","تحسّن","ارتفاع","توسع","إيجابي",
113
  "تقدم","قوي","ازدهار","انتعاش","تحسين","قوة",
114
  ]
115
  ECON_NEGATIVE = [
116
- # English
117
  "deficit","recession","inflation","decline","contraction","debt",
118
  "crisis","deterioration","slowdown","downturn","unemployment","pressure",
119
  "risk","vulnerability","shock","uncertainty","war","sanctions",
120
  "drought","collapse","default","volatile","instability","weak",
121
  "fragile","pessimistic","loss","shrink","fall","negative","downward",
122
  "slump","stagnation","turbulence","disruption","imbalance","burden",
123
- # French
124
  "déficit","récession","crise","ralentissement","chômage","incertitude",
125
  "guerre","effondrement","instabilité","baisse","fragilité","pression",
126
- # Arabic
127
  "عجز","تضخم","ركود","انكماش","أزمة","تدهور","بطالة","انخفاض",
128
  "ضغط","مخاطر","صدمة","عدم استقرار","هشاشة","ديون","عقوبات",
129
  ]
130
 
131
- # ECON_TRIGGER — متوازن إيجابي + سلبي + مؤشرات
132
  ECON_TRIGGER = [
133
- # سلبي
134
  "deficit","risk","crisis","recession","shock","uncertainty",
135
  "slowdown","pressure","vulnerable","weak","deteriorat","downturn",
136
  "contraction","debt","unemployment","inflation","collapse","volatile",
137
  "instability","fragile","stagnation","disruption","sanctions","drought",
138
- # إيجابي
139
  "growth","recovery","improvement","surplus","stable","expansion",
140
  "resilience","rebound","strengthened","acceleration","robust",
141
  "favorable","progress","increase","upturn","confidence","boom",
142
- # مؤشرات محايدة
143
  "gdp","forecast","outlook","trade","fiscal","monetary","exchange",
144
  "interest","budget","revenue","expenditure","policy","reform",
145
- # عربي
146
  "التضخم","الناتج","النمو","العجز","المخاطر","التوقعات","الميزانية",
147
- # فرنسي
148
  "croissance","déficit","récession","prévision","taux","politique",
149
  ]
150
 
151
  def economic_lexicon_score(text: str) -> float:
152
- """Loughran-McDonald Extended Lexicon → [-1, +1]"""
153
  text_lower = text.lower()
154
  pos = sum(1 for w in ECON_POSITIVE if w in text_lower)
155
  neg = sum(1 for w in ECON_NEGATIVE if w in text_lower)
@@ -182,11 +170,10 @@ def detect_document_type(texts: list) -> dict:
182
  }
183
 
184
  # ============================================================
185
- # ENSEMBLE: FinBERT (40%) + XLM-RoBERTa (30%) + Lexicon (30%)
186
  # ============================================================
187
  WEIGHTS = {"finbert": 0.40, "xlm": 0.30, "lexicon": 0.30}
188
 
189
- # — FinBERT (Financial Text Expert) —
190
  print("⏳ Loading FinBERT (ProsusAI)...")
191
  try:
192
  finbert_pipe = pipeline(
@@ -203,7 +190,6 @@ except Exception as e:
203
  finbert_pipe = None
204
  FINBERT_OK = False
205
 
206
- # — XLM-RoBERTa (Multilingual) —
207
  print("⏳ Loading XLM-RoBERTa...")
208
  try:
209
  xlm_pipe = pipeline(
@@ -226,7 +212,6 @@ def normalize_clf(raw):
226
  return raw if isinstance(raw, list) else [raw]
227
 
228
  def clf_finbert(text: str) -> float:
229
- """FinBERT → [-1, +1] | labels: positive / negative / neutral"""
230
  if not FINBERT_OK or finbert_pipe is None:
231
  return 0.0
232
  try:
@@ -238,13 +223,11 @@ def clf_finbert(text: str) -> float:
238
  return 0.0
239
 
240
  def clf_xlm(text: str) -> float:
241
- """XLM-RoBERTa → [-1, +1] | labels: LABEL_0/1/2 or positive/neutral/negative"""
242
  if not XLM_OK or xlm_pipe is None:
243
  return 0.0
244
  try:
245
  items = normalize_clf(xlm_pipe(text[:512]))
246
  d = {r["label"]: float(r["score"]) for r in items}
247
- # XLM labels: LABEL_0=neg, LABEL_1=neu, LABEL_2=pos
248
  pos = d.get("LABEL_2", d.get("positive", d.get("Positive", 0.0)))
249
  neg = d.get("LABEL_0", d.get("negative", d.get("Negative", 0.0)))
250
  return round(pos - neg, 4)
@@ -254,10 +237,8 @@ def clf_xlm(text: str) -> float:
254
 
255
  def sentiment_score_numeric(text: str) -> float:
256
  """
257
- Weighted Ensemble:
258
- 40% FinBERT (financial text expert)
259
- + 30% XLM-RoBERTa (multilingual: AR/FR/EN)
260
- + 30% Loughran-McDonald Lexicon (economic terms)
261
  → [-1, +1]
262
  """
263
  fb = clf_finbert(text)
@@ -271,7 +252,6 @@ def sentiment_score_numeric(text: str) -> float:
271
  )
272
 
273
  def run_sentiment(text: str):
274
- """Ensemble sentiment → (label_str, confidence)"""
275
  score = sentiment_score_numeric(text)
276
  if score > 0.05:
277
  sent = "Positive 😊"
@@ -282,7 +262,6 @@ def run_sentiment(text: str):
282
  return sent, round(min(abs(score), 1.0), 4)
283
 
284
  def run_sentiment_detailed(text: str) -> str:
285
- """Detailed breakdown of each model's contribution"""
286
  fb = clf_finbert(text)
287
  xlm = clf_xlm(text)
288
  lex = economic_lexicon_score(text)
@@ -477,7 +456,7 @@ def build_index(files):
477
  for fname, info in PER_FILE_INFO.items():
478
  n = sum(1 for m in KB_META if m["name"] == fname)
479
  yr = str(info.get("year","N/A"))
480
- yrb = f"{yr} ✅" if yr not in ["None","N/A","None"] else "N/A ⚠️"
481
  badge = " 🟢" if info["is_economic"] else ""
482
  tbl += f"| `{fname}` | {yrb} | {info['type']}{badge} | {info['confidence']:.0%} | {n} |\n"
483
 
@@ -560,31 +539,21 @@ def rag_retrieve(query, k=5, top_n=3):
560
  return []
561
 
562
  # ============================================================
563
- # SMART ECONOMIC CHUNK SAMPLER (Balanced)
564
  # ============================================================
565
  def get_economic_chunks(texts: list, max_chunks: int = 40) -> list:
566
- """
567
- Smart sampler:
568
- 1) فلتر chunks اقتصادية (ECON_TRIGGER)
569
- 2) عيّنة من بداية + وسط + نهاية التقرير
570
- 3) حد أقصى max_chunks
571
- """
572
  n = len(texts)
573
  econ = [t for t in texts if any(kw in t.lower() for kw in ECON_TRIGGER)]
574
-
575
  if len(econ) < 10:
576
  start = texts[:min(10, n)]
577
  mid = texts[n//2-5 : n//2+5] if n > 20 else []
578
  end = texts[-min(10, n):]
579
  econ = list(dict.fromkeys(start + mid + end))
580
-
581
- # عيّنة منتظمة من كل التقرير
582
  if len(econ) > max_chunks:
583
  step = max(1, len(econ) // max_chunks)
584
  sample = econ[::step][:max_chunks]
585
  else:
586
  sample = econ
587
-
588
  return sample
589
 
590
  # ============================================================
@@ -629,7 +598,6 @@ def smart_answer(question, history):
629
  rag_context = rag_context[:2000]
630
  has_good_rag = bool(results) and results[0]["sem"] >= 0.25
631
  answer_text = llm_groq(question, rag_context, history, lang)
632
-
633
  if has_good_rag:
634
  src = ", ".join(f"`{r['file']}` p.{r['page']}" for r in results)
635
  badge = f"\n\n📄 **{'المصدر' if lang=='ar' else 'Source'}:** {src}"
@@ -646,7 +614,6 @@ def smart_answer(question, history):
646
  def predict_with_rag(text):
647
  text = "" if text is None else str(text).strip()
648
  if not text: raise gr.Error("⚠️ Enter text first.")
649
-
650
  lang = detect_lang(text)
651
  qterms = [t for t in re.findall(r"\w+", text.lower()) if len(t)>2]
652
 
@@ -680,10 +647,9 @@ def predict_with_rag(text):
680
  flag = "🇸🇦" if h["lang"]=="ar" else "🇺🇸"
681
  md += f"- 🔑 **`{h['word']}`** → 📄 `{h['file']}` p.{h['page']} {flag}\n\n > {h['sentence']}\n\n"
682
 
683
- # ✅ Ensemble Sentiment Breakdown
684
- detail = run_sentiment_detailed(text)
685
  sent, conf = run_sentiment(text)
686
- md += f"---\n{detail}\n\n"
687
 
688
  md += "---\n## 📍 Exact Location\n\n"
689
  seen2 = set()
@@ -741,7 +707,7 @@ def get_worldbank_data(country_code, indicator, start_year, end_year):
741
  return pd.DataFrame()
742
 
743
  # ============================================================
744
- # YEARLY SENTIMENT INDEX (Ensemble)
745
  # ============================================================
746
  def build_doc_sentiment_index():
747
  if not KB_TEXTS or not KB_META: return None, None
@@ -828,7 +794,7 @@ def run_economic_forecast(country_code, target_var, start_year, end_year):
828
  global_mean = float(df_yearly["sentiment"].mean())
829
  merged["sentiment"] = merged["sentiment"].fillna(global_mean)
830
  has_yearly = True
831
- mode_msg = "✅ **Yearly Ensemble Sentiment** (FinBERT+XLM+Lexicon)"
832
  else:
833
  global_sent = (
834
  float(pd.to_numeric(df_files["sentiment"], errors="coerce").mean())
@@ -839,13 +805,14 @@ def run_economic_forecast(country_code, target_var, start_year, end_year):
839
  has_yearly = False
840
  mode_msg = "⚠️ **Global Sentiment** — rename files like `WEO_2020.pdf`"
841
 
842
- # ── 4) ✅ Normalize Sentiment [-1, +1] ──────────────────
 
843
  if merged["sentiment"].std() > 1e-6:
844
- scaler = MinMaxScaler(feature_range=(-1, 1))
845
  merged["sentiment"] = scaler.fit_transform(
846
  merged["sentiment"].values.reshape(-1, 1)
847
  ).flatten().round(4)
848
- print(f"Sentiment normalized: {dict(zip(merged['year'], merged['sentiment']))}")
849
 
850
  # ── 5) Train / Test ───────────────────────────────────────
851
  series = merged["value"].values.astype(float)
@@ -890,9 +857,12 @@ def run_economic_forecast(country_code, target_var, start_year, end_year):
890
  # Panel 1 — Forecast
891
  ax1 = axes[0]
892
  ax1.plot(years, series, "o-", color="#2196F3", label="Actual", lw=2, ms=5)
893
- ax1.plot(test_years, pred_arima, "s--", color="#FF5722", label="ARIMA(1,1,1)", lw=2)
894
- ax1.plot(test_years, pred_sarimax, "^-.", color="#4CAF50", label="SARIMAX+Ensemble", lw=2)
895
- ax1.axvline(x=years[split-1], color="gray", linestyle=":", alpha=0.7, label="Train│Test")
 
 
 
896
  ax1.set_title(
897
  f"📈 {target_var} — {country_code} "
898
  f"({'Yearly' if has_yearly else 'Global'} Ensemble Sentiment)",
@@ -904,15 +874,18 @@ def run_economic_forecast(country_code, target_var, start_year, end_year):
904
  # Panel 2 — Ensemble Sentiment Timeline
905
  ax2 = axes[1]
906
  s_vals = merged["sentiment"].values
907
- s_clrs = ["#4CAF50" if s>0.05 else "#FF5722" if s<-0.05 else "#FFC107" for s in s_vals]
 
908
  ax2.bar(years, s_vals, color=s_clrs, edgecolor="white", width=0.6)
909
  ax2.axhline(y=0, color="black", lw=0.8)
910
  ax2.set_title(
911
- f"📊 Ensemble Sentiment Index (FinBERT 40% + XLM 30% + Lexicon 30%)\n"
912
- f"{'per-year' if has_yearly else 'global constant'} normalized [-1,+1]",
 
913
  fontsize=10, fontweight="bold"
914
  )
915
- ax2.set_xlabel("Year"); ax2.set_ylabel("Sentiment Score (normalized)")
 
916
  ax2.grid(True, alpha=0.3, axis="y")
917
  ax2.legend(handles=[
918
  Patch(facecolor="#4CAF50", label="Optimistic (>0.05)"),
@@ -920,21 +893,25 @@ def run_economic_forecast(country_code, target_var, start_year, end_year):
920
  Patch(facecolor="#FF5722", label="Pessimistic (<-0.05)"),
921
  ], loc="upper right", fontsize=8)
922
 
923
- # Panel 3 — RMSE
924
  ax3 = axes[2]
925
  clrs = ["#FF5722" if rmse_a <= rmse_s else "#4CAF50",
926
  "#4CAF50" if rmse_s <= rmse_a else "#FF5722"]
927
  bars = ax3.bar(
928
  ["ARIMA(1,1,1)", "SARIMAX\n+Ensemble"],
929
- [rmse_a, rmse_s], color=clrs, width=0.4, edgecolor="white"
 
930
  )
931
  for bar, val in zip(bars, [rmse_a, rmse_s]):
932
  ax3.text(
933
- bar.get_x()+bar.get_width()/2, bar.get_height()+0.01,
934
- f"{val:.4f}", ha="center", va="bottom", fontweight="bold", fontsize=11
 
 
935
  )
936
  ax3.set_title("RMSE Comparison (lower = better)", fontsize=11)
937
- ax3.set_ylabel("RMSE"); ax3.grid(True, alpha=0.3, axis="y")
 
938
 
939
  plt.tight_layout(pad=3.0)
940
  img_path = "/tmp/forecast_plot.png"
@@ -945,15 +922,18 @@ def run_economic_forecast(country_code, target_var, start_year, end_year):
945
  sent_table = ""
946
  if df_files is not None and len(df_files) > 0:
947
  sent_table = "\n---\n### 📄 Ensemble Sentiment per File\n\n"
948
- sent_table += "| 📄 File | 📅 Year | 😊 Score | 📦 Chunks | Label |\n|---|---|---|---|---|\n"
 
949
  for _, row in df_files.iterrows():
950
- yrb = f"{row['year']} ✅" if str(row['year']) not in ["N/A","None"] else "N/A ⚠️"
 
951
  sent_table += (
952
  f"| `{row['file']}` | {yrb} "
953
  f"| `{row['sentiment']:+.4f}` "
954
  f"| {row['n_chunks']} | {row['label']} |\n"
955
  )
956
 
 
957
  arrow = "✅ Improved" if impr_rmse > 0 else "❌ No improvement"
958
  result_md = (
959
  f"## 📊 Forecast — Ensemble Sentiment\n\n"
@@ -966,9 +946,9 @@ def run_economic_forecast(country_code, target_var, start_year, end_year):
966
  f"| 🧪 Test | **{len(test_y)}** samples |\n\n"
967
  f"---\n### 🏆 Model Comparison\n\n"
968
  f"| Model | RMSE | MAE | MAPE |\n|---|---|---|---|\n"
969
- f"| ARIMA(1,1,1) | `{rmse_a:.4f}` | `{mae_a:.4f}` | `{mape_a:.1f}%` |\n"
970
- f"| SARIMAX+Ensemble | `{rmse_s:.4f}` | `{mae_s:.4f}` | `{mape_s:.1f}%` |\n"
971
- f"| **Improvement** | **{impr_rmse:+.1f}%** | **{impr_mae:+.1f}%** | **{impr_mape:+.1f}%** |\n\n"
972
  f"**{arrow}** by adding Ensemble Sentiment Index.\n"
973
  f"{sent_table}"
974
  )
@@ -1009,9 +989,11 @@ def get_stats():
1009
  def get_top_keywords():
1010
  if not KB_TEXTS: return "_No files indexed yet._"
1011
  all_words = re.findall(r"\b\w{4,}\b", " ".join(KB_TEXTS).lower())
1012
- stopwords = {"this","that","with","from","have","been","will","your",
1013
- "they","their","which","when","what","also","more","some",
1014
- "than","then","were","would","could","into","over","such"}
 
 
1015
  top = Counter(w for w in all_words if w not in stopwords).most_common(20)
1016
  return "### 🔑 Top Keywords\n\n" + "\n".join(f"- **{w}**: {c}" for w,c in top)
1017
 
@@ -1070,7 +1052,7 @@ with gr.Blocks(
1070
  gr.Markdown(
1071
  "**Upload PDF / DOCX / TXT / CSV**\n\n"
1072
  "> 💡 Name files like `WEO_2020.pdf` — year in filename required!\n"
1073
- "> ✅ Year must show ✅ not ⚠️ after building index."
1074
  )
1075
  files = gr.File(label="📂 Files",
1076
  file_types=[".pdf",".txt",".csv",".docx"],
@@ -1087,36 +1069,42 @@ with gr.Blocks(
1087
  load_btn.click(load_saved_index, outputs=persist_status)
1088
 
1089
  with gr.Tab("🎭 2 · Sentiment + Search"):
1090
- inp = gr.Textbox(lines=2,
1091
- placeholder="Type text… | اكتب نصاً… | Saisissez un texte…",
1092
- label="📝 Input (Ensemble: FinBERT + XLM + Lexicon)")
 
 
1093
  run_btn = gr.Button("🔍 Analyze", variant="primary")
1094
  with gr.Row():
1095
  out_sent = gr.Textbox(label="🎭 Ensemble Sentiment", interactive=False)
1096
  out_conf = gr.Number(label="📊 Score", precision=4)
1097
  out_full = gr.Markdown()
1098
- run_btn.click(predict_with_rag, inputs=inp, outputs=[out_sent, out_conf, out_full])
 
 
1099
  gr.Markdown("---")
1100
  with gr.Row():
1101
  dl_btn = gr.Button("⬇️ Download Report", variant="secondary")
1102
  rep_file = gr.File(label="report.md")
1103
- dl_btn.click(generate_report, inputs=[inp,out_sent,out_conf,out_full], outputs=rep_file)
 
 
1104
 
1105
  with gr.Tab("💬 3 · Smart Chatbot"):
1106
  chatbot = gr.Chatbot(height=430, type="messages",
1107
  placeholder="Ask anything… / اسأل أي شيء…")
1108
  msg = gr.Textbox(placeholder="Question…", label="💬")
1109
  with gr.Row():
1110
- send_btn = gr.Button("Send ➤", variant="primary")
1111
- clear_btn = gr.Button("🗑️ Clear", variant="secondary")
1112
- stats_btn = gr.Button("📊 Stats", variant="secondary")
1113
  stats_box = gr.Markdown(visible=False)
1114
  with gr.Row():
1115
  export_btn = gr.Button("💾 Export Chat", variant="secondary")
1116
  export_file = gr.File(label="chat_history.txt")
1117
  msg.submit(chat_text, inputs=[msg,chatbot], outputs=[msg,chatbot])
1118
  send_btn.click(chat_text, inputs=[msg,chatbot], outputs=[msg,chatbot])
1119
- clear_btn.click(lambda: ([],""), outputs=[chatbot,msg])
1120
  stats_btn.click(
1121
  lambda: (get_stats(), gr.update(visible=True)),
1122
  outputs=[stats_box, stats_box]
@@ -1129,7 +1117,8 @@ with gr.Blocks(
1129
  transcript = gr.Textbox(label="📝 Transcript", interactive=False)
1130
  audio_out = gr.Audio(label="🔊 Answer", type="filepath")
1131
  voice_btn.click(
1132
- chat_voice, inputs=[audio_in, chatbot],
 
1133
  outputs=[chatbot, audio_out, transcript]
1134
  )
1135
 
@@ -1150,7 +1139,7 @@ with gr.Blocks(
1150
  "| 🏦 Sentiment 1 | **FinBERT** — ProsusAI (40%) |\n"
1151
  "| 🌍 Sentiment 2 | **XLM-RoBERTa** — Cardiff NLP (30%) |\n"
1152
  "| 📖 Sentiment 3 | **Loughran-McDonald Lexicon** (30%) |\n"
1153
- "| ⚡ Ensemble | Weighted 40/30/30 + MinMaxNorm |\n"
1154
  "| 🔍 Embeddings | paraphrase-multilingual-MiniLM-L12-v2 |\n"
1155
  "| 📊 Reranker | cross-encoder/ms-marco-MiniLM-L-6-v2 |\n"
1156
  "| 🤖 LLM | Llama-3.3-70B via Groq |\n"
@@ -1175,18 +1164,27 @@ with gr.Blocks(
1175
  "2. **Build Index** → verify Year = ✅\n"
1176
  "3. Select country + variable + year range\n"
1177
  "4. **Run Forecast** → compare RMSE\n\n"
1178
- "> 🏦 FinBERT(40%) + 🌍 XLM(30%) + 📖 Lexicon(30%) → normalized [-1,+1]"
 
1179
  )
1180
  with gr.Row():
1181
- country_input = gr.Textbox(value="DZ", label="🌍 Country Code",
1182
- placeholder="DZ / US / FR / MA / TN / EG")
 
 
1183
  target_input = gr.Dropdown(
1184
- choices=["Inflation (CPI %)","GDP Growth (%)","Unemployment (%)","Exchange Rate"],
1185
- value="Inflation (CPI %)", label="🎯 Target Variable"
 
 
 
 
 
 
1186
  )
1187
  with gr.Row():
1188
  start_year = gr.Slider(minimum=2000, maximum=2020,
1189
- value=2010, step=1, label="📅 Start Year")
1190
  end_year = gr.Slider(minimum=2010, maximum=2024,
1191
  value=2023, step=1, label="📅 End Year")
1192
  forecast_btn = gr.Button("📈 Run Forecast", variant="primary", size="lg")
@@ -1199,4 +1197,7 @@ with gr.Blocks(
1199
  outputs=[forecast_result, forecast_plot]
1200
  )
1201
 
 
 
 
1202
  app.launch(server_name="0.0.0.0", server_port=7860, show_api=False)
 
96
  ACADEMIC_KEYWORDS = ["abstract","methodology","hypothesis","conclusion","references","doi","journal"]
97
 
98
# ============================================================
# LOUGHRAN-MCDONALD EXTENDED LEXICON
# ============================================================
# Positive economic terms (English / French / Arabic).
# economic_lexicon_score() counts one hit per list entry using a plain
# substring test, so every term must appear exactly once — duplicated
# entries ("expansion", "favorable" were listed in both the English and
# French sections) double-count a single mention.
ECON_POSITIVE = [
    # English
    "growth", "recovery", "surplus", "improvement", "stability", "increase",
    "expansion", "acceleration", "resilience", "upturn", "robust", "favorable",
    "strengthened", "progress", "rebound", "optimistic", "confidence", "boom",
    "prosper", "thrive", "advance", "gain", "rise", "positive", "upward",
    "exceed", "outperform", "strong", "healthy", "dynamic", "sustainable",
    # French ("expansion" and "favorable" are already listed above)
    "croissance", "reprise", "amélioration", "stabilité", "excédent", "hausse",
    "dynamique", "progrès", "rebond", "solide",
    # Arabic
    "تعافي", "نمو", "استقرار", "فائض", "تحسّن", "ارتفاع", "توسع", "إيجابي",
    "تقدم", "قوي", "ازدهار", "انتعاش", "تحسين", "قوة",
]
112
# Negative economic vocabulary, grouped per language and concatenated
# into the single flat list iterated by the lexicon scorer.
_ECON_NEGATIVE_EN = [
    "deficit", "recession", "inflation", "decline", "contraction", "debt",
    "crisis", "deterioration", "slowdown", "downturn", "unemployment", "pressure",
    "risk", "vulnerability", "shock", "uncertainty", "war", "sanctions",
    "drought", "collapse", "default", "volatile", "instability", "weak",
    "fragile", "pessimistic", "loss", "shrink", "fall", "negative", "downward",
    "slump", "stagnation", "turbulence", "disruption", "imbalance", "burden",
]
_ECON_NEGATIVE_FR = [
    "déficit", "récession", "crise", "ralentissement", "chômage", "incertitude",
    "guerre", "effondrement", "instabilité", "baisse", "fragilité", "pression",
]
_ECON_NEGATIVE_AR = [
    "عجز", "تضخم", "ركود", "انكماش", "أزمة", "تدهور", "بطالة", "انخفاض",
    "ضغط", "مخاطر", "صدمة", "عدم استقرار", "هشاشة", "ديون", "عقوبات",
]
ECON_NEGATIVE = _ECON_NEGATIVE_EN + _ECON_NEGATIVE_FR + _ECON_NEGATIVE_AR
124
 
125
# ECON_TRIGGER — a balanced mix of negative, positive and neutral
# indicator keywords, used to pre-filter economically relevant chunks.
_TRIGGER_NEGATIVE = [
    # "deteriorat" is a deliberate stem: it matches deteriorate /
    # deteriorating / deterioration in a substring test.
    "deficit", "risk", "crisis", "recession", "shock", "uncertainty",
    "slowdown", "pressure", "vulnerable", "weak", "deteriorat", "downturn",
    "contraction", "debt", "unemployment", "inflation", "collapse", "volatile",
    "instability", "fragile", "stagnation", "disruption", "sanctions", "drought",
]
_TRIGGER_POSITIVE = [
    "growth", "recovery", "improvement", "surplus", "stable", "expansion",
    "resilience", "rebound", "strengthened", "acceleration", "robust",
    "favorable", "progress", "increase", "upturn", "confidence", "boom",
]
_TRIGGER_INDICATORS = [
    # Neutral macro indicators (English)
    "gdp", "forecast", "outlook", "trade", "fiscal", "monetary", "exchange",
    "interest", "budget", "revenue", "expenditure", "policy", "reform",
    # Arabic
    "التضخم", "الناتج", "النمو", "العجز", "المخاطر", "التوقعات", "الميزانية",
    # French
    "croissance", "déficit", "récession", "prévision", "taux", "politique",
]
ECON_TRIGGER = _TRIGGER_NEGATIVE + _TRIGGER_POSITIVE + _TRIGGER_INDICATORS
139
 
140
  def economic_lexicon_score(text: str) -> float:
 
141
  text_lower = text.lower()
142
  pos = sum(1 for w in ECON_POSITIVE if w in text_lower)
143
  neg = sum(1 for w in ECON_NEGATIVE if w in text_lower)
 
170
  }
171
 
172
  # ============================================================
173
+ # ENSEMBLE: FinBERT (40%) + XLM-RoBERTa (30%) + Lexicon (30%)
174
  # ============================================================
175
  WEIGHTS = {"finbert": 0.40, "xlm": 0.30, "lexicon": 0.30}
176
 
 
177
  print("⏳ Loading FinBERT (ProsusAI)...")
178
  try:
179
  finbert_pipe = pipeline(
 
190
  finbert_pipe = None
191
  FINBERT_OK = False
192
 
 
193
  print("⏳ Loading XLM-RoBERTa...")
194
  try:
195
  xlm_pipe = pipeline(
 
212
  return raw if isinstance(raw, list) else [raw]
213
 
214
  def clf_finbert(text: str) -> float:
 
215
  if not FINBERT_OK or finbert_pipe is None:
216
  return 0.0
217
  try:
 
223
  return 0.0
224
 
225
  def clf_xlm(text: str) -> float:
 
226
  if not XLM_OK or xlm_pipe is None:
227
  return 0.0
228
  try:
229
  items = normalize_clf(xlm_pipe(text[:512]))
230
  d = {r["label"]: float(r["score"]) for r in items}
 
231
  pos = d.get("LABEL_2", d.get("positive", d.get("Positive", 0.0)))
232
  neg = d.get("LABEL_0", d.get("negative", d.get("Negative", 0.0)))
233
  return round(pos - neg, 4)
 
237
 
238
  def sentiment_score_numeric(text: str) -> float:
239
  """
240
+ Weighted Ensemble:
241
+ 40% FinBERT + 30% XLM-RoBERTa + 30% Loughran-McDonald
 
 
242
  → [-1, +1]
243
  """
244
  fb = clf_finbert(text)
 
252
  )
253
 
254
  def run_sentiment(text: str):
 
255
  score = sentiment_score_numeric(text)
256
  if score > 0.05:
257
  sent = "Positive 😊"
 
262
  return sent, round(min(abs(score), 1.0), 4)
263
 
264
  def run_sentiment_detailed(text: str) -> str:
 
265
  fb = clf_finbert(text)
266
  xlm = clf_xlm(text)
267
  lex = economic_lexicon_score(text)
 
456
  for fname, info in PER_FILE_INFO.items():
457
  n = sum(1 for m in KB_META if m["name"] == fname)
458
  yr = str(info.get("year","N/A"))
459
+ yrb = f"{yr} ✅" if yr not in ["None","N/A"] else "N/A ⚠️"
460
  badge = " 🟢" if info["is_economic"] else ""
461
  tbl += f"| `{fname}` | {yrb} | {info['type']}{badge} | {info['confidence']:.0%} | {n} |\n"
462
 
 
539
  return []
540
 
541
# ============================================================
# SMART ECONOMIC CHUNK SAMPLER
# ============================================================
def get_economic_chunks(texts: list, max_chunks: int = 40) -> list:
    """Select a bounded, economics-focused sample of text chunks.

    Keeps every chunk containing at least one ``ECON_TRIGGER`` keyword
    (case-insensitive substring match). If fewer than 10 chunks match,
    falls back to sampling the document's head, middle and tail instead.
    The result is then thinned by uniform striding to at most
    ``max_chunks`` entries; original ordering is preserved throughout.
    """
    total = len(texts)
    candidates = [
        chunk for chunk in texts
        if any(trigger in chunk.lower() for trigger in ECON_TRIGGER)
    ]
    if len(candidates) < 10:
        # Too few keyword hits — take a start/middle/end sample of the
        # report, de-duplicating while keeping first-seen order.
        head = texts[:min(10, total)]
        middle = texts[total // 2 - 5: total // 2 + 5] if total > 20 else []
        tail = texts[-min(10, total):]
        candidates = list(dict.fromkeys(head + middle + tail))
    if len(candidates) <= max_chunks:
        return candidates
    stride = max(1, len(candidates) // max_chunks)
    return candidates[::stride][:max_chunks]
558
 
559
  # ============================================================
 
598
  rag_context = rag_context[:2000]
599
  has_good_rag = bool(results) and results[0]["sem"] >= 0.25
600
  answer_text = llm_groq(question, rag_context, history, lang)
 
601
  if has_good_rag:
602
  src = ", ".join(f"`{r['file']}` p.{r['page']}" for r in results)
603
  badge = f"\n\n📄 **{'المصدر' if lang=='ar' else 'Source'}:** {src}"
 
614
  def predict_with_rag(text):
615
  text = "" if text is None else str(text).strip()
616
  if not text: raise gr.Error("⚠️ Enter text first.")
 
617
  lang = detect_lang(text)
618
  qterms = [t for t in re.findall(r"\w+", text.lower()) if len(t)>2]
619
 
 
647
  flag = "🇸🇦" if h["lang"]=="ar" else "🇺🇸"
648
  md += f"- 🔑 **`{h['word']}`** → 📄 `{h['file']}` p.{h['page']} {flag}\n\n > {h['sentence']}\n\n"
649
 
650
+ detail = run_sentiment_detailed(text)
 
651
  sent, conf = run_sentiment(text)
652
+ md += f"---\n{detail}\n\n"
653
 
654
  md += "---\n## 📍 Exact Location\n\n"
655
  seen2 = set()
 
707
  return pd.DataFrame()
708
 
709
  # ============================================================
710
+ # YEARLY SENTIMENT INDEX
711
  # ============================================================
712
  def build_doc_sentiment_index():
713
  if not KB_TEXTS or not KB_META: return None, None
 
794
  global_mean = float(df_yearly["sentiment"].mean())
795
  merged["sentiment"] = merged["sentiment"].fillna(global_mean)
796
  has_yearly = True
797
+ mode_msg = "✅ **Yearly Ensemble Sentiment** (FinBERT 40%+XLM 30%+Lexicon 30%)"
798
  else:
799
  global_sent = (
800
  float(pd.to_numeric(df_files["sentiment"], errors="coerce").mean())
 
805
  has_yearly = False
806
  mode_msg = "⚠️ **Global Sentiment** — rename files like `WEO_2020.pdf`"
807
 
808
+ # ── 4) ✅ FIXED Normalization feature_range=(-0.3, 0.3) ──
809
+ # يُلطّف تأثير Sentiment ويمنع over-prediction في Panel 1
810
  if merged["sentiment"].std() > 1e-6:
811
+ scaler = MinMaxScaler(feature_range=(-0.3, 0.3))
812
  merged["sentiment"] = scaler.fit_transform(
813
  merged["sentiment"].values.reshape(-1, 1)
814
  ).flatten().round(4)
815
+ print(f"Sentiment normalized [-0.3,+0.3]: {dict(zip(merged['year'], merged['sentiment']))}")
816
 
817
  # ── 5) Train / Test ───────────────────────────────────────
818
  series = merged["value"].values.astype(float)
 
857
  # Panel 1 — Forecast
858
  ax1 = axes[0]
859
  ax1.plot(years, series, "o-", color="#2196F3", label="Actual", lw=2, ms=5)
860
+ ax1.plot(test_years, pred_arima, "s--", color="#FF5722",
861
+ label="ARIMA(1,1,1)", lw=2)
862
+ ax1.plot(test_years, pred_sarimax, "^-.", color="#4CAF50",
863
+ label="SARIMAX+Ensemble", lw=2)
864
+ ax1.axvline(x=years[split-1], color="gray", linestyle=":",
865
+ alpha=0.7, label="Train│Test")
866
  ax1.set_title(
867
  f"📈 {target_var} — {country_code} "
868
  f"({'Yearly' if has_yearly else 'Global'} Ensemble Sentiment)",
 
874
  # Panel 2 — Ensemble Sentiment Timeline
875
  ax2 = axes[1]
876
  s_vals = merged["sentiment"].values
877
+ s_clrs = ["#4CAF50" if s>0.05 else "#FF5722" if s<-0.05 else "#FFC107"
878
+ for s in s_vals]
879
  ax2.bar(years, s_vals, color=s_clrs, edgecolor="white", width=0.6)
880
  ax2.axhline(y=0, color="black", lw=0.8)
881
  ax2.set_title(
882
+ f"📊 Ensemble Sentiment Index "
883
+ f"(FinBERT 40% + XLM 30% + Lexicon 30%)\n"
884
+ f"{'per-year' if has_yearly else 'global'} — normalized [-0.3, +0.3]",
885
  fontsize=10, fontweight="bold"
886
  )
887
+ ax2.set_xlabel("Year")
888
+ ax2.set_ylabel("Sentiment Score (normalized)")
889
  ax2.grid(True, alpha=0.3, axis="y")
890
  ax2.legend(handles=[
891
  Patch(facecolor="#4CAF50", label="Optimistic (>0.05)"),
 
893
  Patch(facecolor="#FF5722", label="Pessimistic (<-0.05)"),
894
  ], loc="upper right", fontsize=8)
895
 
896
+ # Panel 3 — RMSE Comparison
897
  ax3 = axes[2]
898
  clrs = ["#FF5722" if rmse_a <= rmse_s else "#4CAF50",
899
  "#4CAF50" if rmse_s <= rmse_a else "#FF5722"]
900
  bars = ax3.bar(
901
  ["ARIMA(1,1,1)", "SARIMAX\n+Ensemble"],
902
+ [rmse_a, rmse_s],
903
+ color=clrs, width=0.4, edgecolor="white"
904
  )
905
  for bar, val in zip(bars, [rmse_a, rmse_s]):
906
  ax3.text(
907
+ bar.get_x()+bar.get_width()/2,
908
+ bar.get_height()+0.01,
909
+ f"{val:.4f}",
910
+ ha="center", va="bottom", fontweight="bold", fontsize=11
911
  )
912
  ax3.set_title("RMSE Comparison (lower = better)", fontsize=11)
913
+ ax3.set_ylabel("RMSE")
914
+ ax3.grid(True, alpha=0.3, axis="y")
915
 
916
  plt.tight_layout(pad=3.0)
917
  img_path = "/tmp/forecast_plot.png"
 
922
  sent_table = ""
923
  if df_files is not None and len(df_files) > 0:
924
  sent_table = "\n---\n### 📄 Ensemble Sentiment per File\n\n"
925
+ sent_table += "| 📄 File | 📅 Year | 😊 Score | 📦 Chunks | Label |\n"
926
+ sent_table += "|---|---|---|---|---|\n"
927
  for _, row in df_files.iterrows():
928
+ yrb = (f"{row['year']} ✅"
929
+ if str(row['year']) not in ["N/A","None"] else "N/A ⚠️")
930
  sent_table += (
931
  f"| `{row['file']}` | {yrb} "
932
  f"| `{row['sentiment']:+.4f}` "
933
  f"| {row['n_chunks']} | {row['label']} |\n"
934
  )
935
 
936
+ # ── 10) Result Text ────────────────────────────────────────
937
  arrow = "✅ Improved" if impr_rmse > 0 else "❌ No improvement"
938
  result_md = (
939
  f"## 📊 Forecast — Ensemble Sentiment\n\n"
 
946
  f"| 🧪 Test | **{len(test_y)}** samples |\n\n"
947
  f"---\n### 🏆 Model Comparison\n\n"
948
  f"| Model | RMSE | MAE | MAPE |\n|---|---|---|---|\n"
949
+ f"| ARIMA(1,1,1) | `{rmse_a:.4f}` | `{mae_a:.4f}` | `{mape_a:.1f}%` |\n"
950
+ f"| SARIMAX+Ensemble | `{rmse_s:.4f}` | `{mae_s:.4f}` | `{mape_s:.1f}%` |\n"
951
+ f"| **Improvement** | **{impr_rmse:+.1f}%** | **{impr_mae:+.1f}%** | **{impr_mape:+.1f}%** |\n\n"
952
  f"**{arrow}** by adding Ensemble Sentiment Index.\n"
953
  f"{sent_table}"
954
  )
 
989
def get_top_keywords():
    """Render the 20 most frequent words across all indexed chunks.

    Considers lowercase tokens of 4+ word characters, drops a small
    English stopword set, and returns a Markdown bullet list (or a
    placeholder string when nothing has been indexed yet).
    """
    if not KB_TEXTS:
        return "_No files indexed yet._"
    ignored = frozenset({
        "this", "that", "with", "from", "have", "been", "will", "your",
        "they", "their", "which", "when", "what", "also", "more", "some",
        "than", "then", "were", "would", "could", "into", "over", "such",
    })
    tokens = re.findall(r"\b\w{4,}\b", " ".join(KB_TEXTS).lower())
    ranking = Counter(t for t in tokens if t not in ignored).most_common(20)
    bullets = "\n".join(f"- **{word}**: {freq}" for word, freq in ranking)
    return "### 🔑 Top Keywords\n\n" + bullets
999
 
 
1052
  gr.Markdown(
1053
  "**Upload PDF / DOCX / TXT / CSV**\n\n"
1054
  "> 💡 Name files like `WEO_2020.pdf` — year in filename required!\n"
1055
+ "> ✅ Year must show ✅ not ⚠️"
1056
  )
1057
  files = gr.File(label="📂 Files",
1058
  file_types=[".pdf",".txt",".csv",".docx"],
 
1069
  load_btn.click(load_saved_index, outputs=persist_status)
1070
 
1071
  with gr.Tab("🎭 2 · Sentiment + Search"):
1072
+ inp = gr.Textbox(
1073
+ lines=2,
1074
+ placeholder="Type text… | اكتب نصاً… | Saisissez un texte…",
1075
+ label="📝 Input (Ensemble: FinBERT 40% + XLM 30% + Lexicon 30%)"
1076
+ )
1077
  run_btn = gr.Button("🔍 Analyze", variant="primary")
1078
  with gr.Row():
1079
  out_sent = gr.Textbox(label="🎭 Ensemble Sentiment", interactive=False)
1080
  out_conf = gr.Number(label="📊 Score", precision=4)
1081
  out_full = gr.Markdown()
1082
+ run_btn.click(predict_with_rag,
1083
+ inputs=inp,
1084
+ outputs=[out_sent, out_conf, out_full])
1085
  gr.Markdown("---")
1086
  with gr.Row():
1087
  dl_btn = gr.Button("⬇️ Download Report", variant="secondary")
1088
  rep_file = gr.File(label="report.md")
1089
+ dl_btn.click(generate_report,
1090
+ inputs=[inp, out_sent, out_conf, out_full],
1091
+ outputs=rep_file)
1092
 
1093
  with gr.Tab("💬 3 · Smart Chatbot"):
1094
  chatbot = gr.Chatbot(height=430, type="messages",
1095
  placeholder="Ask anything… / اسأل أي شيء…")
1096
  msg = gr.Textbox(placeholder="Question…", label="💬")
1097
  with gr.Row():
1098
+ send_btn = gr.Button("Send ➤", variant="primary")
1099
+ clear_btn = gr.Button("🗑️ Clear", variant="secondary")
1100
+ stats_btn = gr.Button("📊 Stats", variant="secondary")
1101
  stats_box = gr.Markdown(visible=False)
1102
  with gr.Row():
1103
  export_btn = gr.Button("💾 Export Chat", variant="secondary")
1104
  export_file = gr.File(label="chat_history.txt")
1105
  msg.submit(chat_text, inputs=[msg,chatbot], outputs=[msg,chatbot])
1106
  send_btn.click(chat_text, inputs=[msg,chatbot], outputs=[msg,chatbot])
1107
+ clear_btn.click(lambda: ([], ""), outputs=[chatbot, msg])
1108
  stats_btn.click(
1109
  lambda: (get_stats(), gr.update(visible=True)),
1110
  outputs=[stats_box, stats_box]
 
1117
  transcript = gr.Textbox(label="📝 Transcript", interactive=False)
1118
  audio_out = gr.Audio(label="🔊 Answer", type="filepath")
1119
  voice_btn.click(
1120
+ chat_voice,
1121
+ inputs=[audio_in, chatbot],
1122
  outputs=[chatbot, audio_out, transcript]
1123
  )
1124
 
 
1139
  "| 🏦 Sentiment 1 | **FinBERT** — ProsusAI (40%) |\n"
1140
  "| 🌍 Sentiment 2 | **XLM-RoBERTa** — Cardiff NLP (30%) |\n"
1141
  "| 📖 Sentiment 3 | **Loughran-McDonald Lexicon** (30%) |\n"
1142
+ "| ⚡ Ensemble | Weighted 40/30/30 + MinMax[-0.3,+0.3] |\n"
1143
  "| 🔍 Embeddings | paraphrase-multilingual-MiniLM-L12-v2 |\n"
1144
  "| 📊 Reranker | cross-encoder/ms-marco-MiniLM-L-6-v2 |\n"
1145
  "| 🤖 LLM | Llama-3.3-70B via Groq |\n"
 
1164
  "2. **Build Index** → verify Year = ✅\n"
1165
  "3. Select country + variable + year range\n"
1166
  "4. **Run Forecast** → compare RMSE\n\n"
1167
+ "> 🏦 FinBERT(40%) + 🌍 XLM(30%) + 📖 Lexicon(30%)\n"
1168
+ "> Normalized to **[-0.3, +0.3]** for smooth forecasting"
1169
  )
1170
  with gr.Row():
1171
+ country_input = gr.Textbox(
1172
+ value="DZ", label="🌍 Country Code",
1173
+ placeholder="DZ / US / FR / MA / TN / EG"
1174
+ )
1175
  target_input = gr.Dropdown(
1176
+ choices=[
1177
+ "Inflation (CPI %)",
1178
+ "GDP Growth (%)",
1179
+ "Unemployment (%)",
1180
+ "Exchange Rate"
1181
+ ],
1182
+ value="Inflation (CPI %)",
1183
+ label="🎯 Target Variable"
1184
  )
1185
  with gr.Row():
1186
  start_year = gr.Slider(minimum=2000, maximum=2020,
1187
+ value=2000, step=1, label="📅 Start Year")
1188
  end_year = gr.Slider(minimum=2010, maximum=2024,
1189
  value=2023, step=1, label="📅 End Year")
1190
  forecast_btn = gr.Button("📈 Run Forecast", variant="primary", size="lg")
 
1197
  outputs=[forecast_result, forecast_plot]
1198
  )
1199
 
1200
+ # ============================================================
1201
+ # LAUNCH
1202
+ # ============================================================
1203
  app.launch(server_name="0.0.0.0", server_port=7860, show_api=False)