sitayeb committed on
Commit
ed243b3
ยท
verified ยท
1 Parent(s): ade7a04

Update app.py

Browse files

# ============================================================
# IMPORTS
# ============================================================
import re
import os
import math
import pickle
import requests
from collections import Counter
import numpy as np
import pandas as pd
import faiss
import PyPDF2
import torch
import gradio as gr
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
from sentence_transformers import SentenceTransformer, CrossEncoder
from langdetect import detect, DetectorFactory
from gtts import gTTS
from transformers import pipeline as hf_pipeline
from transformers import pipeline
from datetime import datetime
from groq import Groq
from sklearn.preprocessing import MinMaxScaler
from scipy import stats

DetectorFactory.seed = 0

GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
groq_client = Groq(api_key=GROQ_API_KEY)

KB_TEXTS = []
KB_META = []
FAISS_INDEX = None
KB_EMB = None
DOC_TYPE_INFO = {"type": "๐Ÿ“„ General", "is_economic": False, "score": 0}
PER_FILE_INFO = {}
CHAT_STATS = {"questions": 0, "found": 0, "not_found": 0}
MIN_SIMILARITY = 0.10

PERSIST_DIR = "/tmp"
KB_TEXTS_PATH = f"{PERSIST_DIR}/kb_texts.pkl"
KB_META_PATH = f"{PERSIST_DIR}/kb_meta.pkl"
FAISS_PATH = f"{PERSIST_DIR}/faiss.index"
os.makedirs(PERSIST_DIR, exist_ok=True)

# ============================================================
# PERSIST
# ============================================================
def save_index():
if FAISS_INDEX is None or not KB_TEXTS:
return "โš ๏ธ No index to save."
try:
with open(KB_TEXTS_PATH, "wb") as f: pickle.dump(KB_TEXTS, f)
with open(KB_META_PATH, "wb") as f: pickle.dump(KB_META, f)
faiss.write_index(FAISS_INDEX, FAISS_PATH)
return f"๐Ÿ’พ Saved! {len(KB_TEXTS):,} chunks"
except Exception as e:
return f"โŒ Save error: {e}"

def load_saved_index():
global KB_TEXTS, KB_META, FAISS_INDEX, DOC_TYPE_INFO
try:
if not os.path.exists(FAISS_PATH):
return "_No saved index found._"
with open(KB_TEXTS_PATH, "rb") as f: KB_TEXTS = pickle.load(f)
with open(KB_META_PATH, "rb") as f: KB_META = pickle.load(f)
FAISS_INDEX = faiss.read_index(FAISS_PATH)
DOC_TYPE_INFO = detect_document_type(KB_TEXTS)
return f"โœ… **Index loaded!** `{len(KB_TEXTS):,}` chunks\n๐Ÿท๏ธ Type: **{DOC_TYPE_INFO['type']}**"
except Exception as e:
return f"โŒ Load error: {e}"

# ============================================================
# KEYWORDS & LEXICONS
# ============================================================
ECONOMIC_KEYWORDS = [
"gdp","inflation","monetary","fiscal","forecast","exchange rate",
"interest rate","unemployment","recession","growth rate","trade balance",
"budget deficit","central bank","economic outlook","imf","world bank",
"cpi","macro","revenue","expenditure","deficit","surplus","debt",
"croissance","taux","banque centrale","prรฉvision","รฉconomique","pib",
"ุงู„ุชุถุฎู…","ุงู„ู†ุงุชุฌ ุงู„ู…ุญู„ูŠ","ุงู„ู†ู…ูˆ ุงู„ุงู‚ุชุตุงุฏูŠ","ุงู„ุจู†ูƒ ุงู„ู…ุฑูƒุฒูŠ","ุณุนุฑ ุงู„ุตุฑู",
]
MEDICAL_KEYWORDS = ["patient","diagnosis","treatment","clinical","hospital","symptom","disease"]
LEGAL_KEYWORDS = ["article","law","contract","clause","jurisdiction","court","legal"]
ACADEMIC_KEYWORDS = ["abstract","methodology","hypothesis","conclusion","references","doi","journal"]

ECON_POSITIVE = [
"growth","recovery","surplus","improvement","stability","increase",
"expansion","acceleration","resilience","upturn","robust","favorable",
"strengthened","progress","rebound","optimistic","confidence","boom",
"prosper","thrive","advance","gain","rise","positive","upward",
"exceed","outperform","strong","healthy","dynamic","sustainable",
"croissance","reprise","amรฉlioration","stabilitรฉ","excรฉdent","hausse",
"ุชุนุงููŠ","ู†ู…ูˆ","ุงุณุชู‚ุฑุงุฑ","ูุงุฆุถ","ุชุญุณู‘ู†","ุงุฑุชูุงุน","ุชูˆุณุน","ุฅูŠุฌุงุจูŠ",
]
ECON_NEGATIVE = [
"deficit","recession","inflation","decline","contraction","debt",
"crisis","deterioration","slowdown","downturn","unemployment","pressure",
"risk","vulnerability","shock","uncertainty","war","sanctions",
"drought","collapse","default","volatile","instability","weak",
"fragile","pessimistic","loss","shrink","fall","negative","downward",
"dรฉficit","rรฉcession","crise","ralentissement","chรดmage","incertitude",
"ุนุฌุฒ","ุชุถุฎู…","ุฑูƒูˆุฏ","ุงู†ูƒู…ุงุด","ุฃุฒู…ุฉ","ุชุฏู‡ูˆุฑ","ุจุทุงู„ุฉ","ุงู†ุฎูุงุถ",
"ุถุบุท","ู…ุฎุงุทุฑ","ุตุฏู…ุฉ","ุนุฏู… ุงุณุชู‚ุฑุงุฑ","ู‡ุดุงุดุฉ","ุฏูŠูˆู†",
]
ECON_TRIGGER = [
"deficit","risk","crisis","recession","shock","uncertainty",
"slowdown","pressure","vulnerable","weak","deteriorat","downturn",
"growth","recovery","improvement","surplus","stable","expansion",
"resilience","rebound","gdp","forecast","outlook","trade","fiscal",
"monetary","exchange","interest","budget","revenue","expenditure",
"ุงู„ุชุถุฎู…","ุงู„ู†ุงุชุฌ","ุงู„ู†ู…ูˆ","ุงู„ุนุฌุฒ","ุงู„ู…ุฎุงุทุฑ","ุงู„ุชูˆู‚ุนุงุช",
"croissance","dรฉficit","rรฉcession","prรฉvision","taux","politique",
]

def economic_lexicon_score(text: str) -> float:
text_lower = text.lower()
pos = sum(1 for w in ECON_POSITIVE if w in text_lower)
neg = sum(1 for w in ECON_NEGATIVE if w in text_lower)
total = max(pos + neg, 1)
return round((pos - neg) / total, 4)

def detect_document_type(texts: list) -> dict:
if not texts:
return {"type":"๐Ÿ“„ General","is_economic":False,"score":0,"confidence":0.0}
full_text = " ".join(texts[:30]).lower()
scores = {
"economic": sum(1 for kw in ECONOMIC_KEYWORDS if kw in full_text),
"medical" : sum(1 for kw in MEDICAL_KEYWORDS if kw in full_text),
"legal" : sum(1 for kw in LEGAL_KEYWORDS if kw in full_text),
"academic": sum(1 for kw in ACADEMIC_KEYWORDS if kw in full_text),
"general" : 1,
}
doc_type = max(scores, key=scores.get)
confidence = round(scores[doc_type] / max(sum(scores.values()), 1), 2)
icons = {
"economic":"๐Ÿ“Š Economic","medical":"๐Ÿฅ Medical",
"legal":"โš–๏ธ Legal","academic":"๐ŸŽ“ Academic","general":"๐Ÿ“„ General",
}
return {
"type" : icons.get(doc_type, "๐Ÿ“„ General"),
"raw_type" : doc_type,
"is_economic": doc_type == "economic" and scores["economic"] >= 3,
"score" : scores[doc_type],
"confidence" : confidence,
}

# ============================================================
# AI MODELS
# ============================================================
WEIGHTS = {"finbert": 0.40, "xlm": 0.30, "lexicon": 0.30}

print("โณ Loading FinBERT...")
try:
finbert_pipe = pipeline(
"text-classification", model="ProsusAI/finbert",
tokenizer="ProsusAI/finbert", return_all_scores=True,
device=0 if torch.cuda.is_available() else -1,
)
FINBERT_OK = True
except Exception as e:
print(f"โš ๏ธ FinBERT: {e}"); finbert_pipe = None; FINBERT_OK = False

print("โณ Loading XLM-RoBERTa...")
try:
xlm_pipe = pipeline(
"text-classification",
model="cardiffnlp/twitter-xlm-roberta-base-sentiment",
tokenizer="cardiffnlp/twitter-xlm-roberta-base-sentiment",
return_all_scores=True,
device=0 if torch.cuda.is_available() else -1,
)
XLM_OK = True
except Exception as e:
print(f"โš ๏ธ XLM: {e}"); xlm_pipe = None; XLM_OK = False

def normalize_clf(raw):
if isinstance(raw, list) and raw and isinstance(raw[0], list): raw = raw[0]
return raw if isinstance(raw, list) else [raw]

def clf_finbert(text: str) -> float:
if not FINBERT_OK or finbert_pipe is None: return 0.0
try:
items = normalize_clf(finbert_pipe(text[:512]))
d = {r["label"].lower(): float(r["score"]) for r in items}
return round(d.get("positive", 0.0) - d.get("negative", 0.0), 4)
except: return 0.0

def clf_xlm(text: str) -> float:
if not XLM_OK or xlm_pipe is None: return 0.0
try:
items = normalize_clf(xlm_pipe(text[:512]))
d = {r["label"]: float(r["score"]) for r in items}
pos = d.get("LABEL_2", d.get("positive", d.get("Positive", 0.0)))
neg = d.get("LABEL_0", d.get("negative", d.get("Negative", 0.0)))
return round(pos - neg, 4)
except: return 0.0

def sentiment_score_numeric(text: str) -> float:
fb = clf_finbert(text)
xlm = clf_xlm(text)
lex = economic_lexicon_score(text)
return round(WEIGHTS["finbert"]*fb + WEIGHTS["xlm"]*xlm + WEIGHTS["lexicon"]*lex, 4)

def run_sentiment(text: str):
score = sentiment_score_numeric(text)
if score > 0.05: sent = "Positive ๐Ÿ˜Š"
elif score < -0.05: sent = "Negative ๐Ÿ˜ž"
else: sent = "Neutral ๐Ÿ˜"
return sent, round(min(abs(score), 1.0), 4)

def run_sentiment_detailed(text: str) -> str:
fb = clf_finbert(text)
xlm = clf_xlm(text)
lex = economic_lexicon_score(text)
final = sentiment_score_numeric(text)
def bar(s):
filled = max(0, min(10, round((s + 1) / 2 * 10)))
icon = "๐ŸŸฉ" if s > 0.05 else "๐ŸŸฅ" if s < -0.05 else "๐ŸŸจ"
return icon * filled + "โฌœ" * (10 - filled)
label = "๐ŸŸข **Positive**" if final > 0.05 else "๐Ÿ”ด **Negative**" if final < -0.05 else "๐ŸŸก **Neutral**"
return (
f"### ๐Ÿ† Ensemble Sentiment Breakdown\n\n"
f"| Model | Score | Bar | Weight |\n|---|---|---|---|\n"
f"| ๐Ÿฆ FinBERT | `{fb:+.4f}` | {bar(fb)} | **40%** |\n"
f"| ๐ŸŒ XLM-RoBERTa | `{xlm:+.4f}` | {bar(xlm)} | **30%** |\n"
f"| ๐Ÿ“– Lexicon | `{lex:+.4f}` | {bar(lex)} | **30%** |\n"
f"| โšก **Final** | **`{final:+.4f}`** | {bar(final)} | **100%** |\n\n"
f"{label}"
)

print("โณ Loading Embedder + Reranker + ASR...")
embedder = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2", max_length=512)
asr = hf_pipeline(
"automatic-speech-recognition", model="openai/whisper-small",

Files changed (1) hide show
  1. app.py +522 -176
app.py CHANGED
@@ -34,7 +34,7 @@ DetectorFactory.seed = 0
34
  # ============================================================
35
  GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
36
  groq_client = Groq(api_key=GROQ_API_KEY)
37
- print(f"DEBUG โ€” Groq Key: {bool(GROQ_API_KEY)}")
38
 
39
  # ============================================================
40
  # GLOBAL STATE
@@ -54,6 +54,9 @@ KB_META_PATH = f"{PERSIST_DIR}/kb_meta.pkl"
54
  FAISS_PATH = f"{PERSIST_DIR}/faiss.index"
55
  os.makedirs(PERSIST_DIR, exist_ok=True)
56
 
 
 
 
57
  def save_index():
58
  if FAISS_INDEX is None or not KB_TEXTS:
59
  return "โš ๏ธ No index to save."
@@ -150,7 +153,7 @@ def detect_document_type(texts: list) -> dict:
150
  }
151
  doc_type = max(scores, key=scores.get)
152
  confidence = round(scores[doc_type] / max(sum(scores.values()), 1), 2)
153
- icons = {
154
  "economic":"๐Ÿ“Š Economic","medical":"๐Ÿฅ Medical",
155
  "legal":"โš–๏ธ Legal","academic":"๐ŸŽ“ Academic","general":"๐Ÿ“„ General",
156
  }
@@ -163,11 +166,11 @@ def detect_document_type(texts: list) -> dict:
163
  }
164
 
165
  # ============================================================
166
- # AI MODELS
167
  # ============================================================
168
  WEIGHTS = {"finbert": 0.40, "xlm": 0.30, "lexicon": 0.30}
169
 
170
- print("โณ Loading FinBERT (ProsusAI)...")
171
  try:
172
  finbert_pipe = pipeline(
173
  "text-classification",
@@ -230,9 +233,9 @@ def sentiment_score_numeric(text: str) -> float:
230
 
231
  def run_sentiment(text: str):
232
  score = sentiment_score_numeric(text)
233
- if score > 0.05: sent = "Positive ๐Ÿ˜Š"
234
  elif score < -0.05: sent = "Negative ๐Ÿ˜ž"
235
- else: sent = "Neutral ๐Ÿ˜"
236
  return sent, round(min(abs(score), 1.0), 4)
237
 
238
  def run_sentiment_detailed(text: str) -> str:
@@ -248,37 +251,50 @@ def run_sentiment_detailed(text: str) -> str:
248
  return (
249
  f"### ๐Ÿ† Ensemble Sentiment Breakdown\n\n"
250
  f"| Model | Score | Bar | Weight |\n|---|---|---|---|\n"
251
- f"| ๐Ÿฆ FinBERT | `{fb:+.4f}` | {bar(fb)} | **40%** |\n"
252
- f"| ๐ŸŒ XLM-RoBERTa | `{xlm:+.4f}` | {bar(xlm)} | **30%** |\n"
253
- f"| ๐Ÿ“– Lexicon | `{lex:+.4f}` | {bar(lex)} | **30%** |\n"
254
- f"| โšก **Final** | **`{final:+.4f}`** | {bar(final)} | **100%** |\n\n"
255
  f"{label}"
256
  )
257
 
258
- print("โณ Loading Embedder + Reranker + ASR...")
 
 
 
259
  embedder = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
260
  reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2", max_length=512)
261
- asr = hf_pipeline("automatic-speech-recognition", model="openai/whisper-small", device=0 if torch.cuda.is_available() else -1)
 
 
 
 
262
  _ = embedder.encode(["warmup"], convert_to_numpy=True)
263
  print("โœ… All models loaded!")
264
 
265
  _startup = load_saved_index()
266
- print(f"๐Ÿ”„ Startup: {_startup}")
267
 
268
  # ============================================================
269
  # RAG CORE
270
  # ============================================================
271
- def clean_filename(path: str) -> str: return os.path.basename(str(path))
 
 
272
  def detect_lang(text: str) -> str:
273
- try: return "ar" if str(detect(str(text)[:300])).startswith("ar") else "en"
274
- except: return "en"
 
 
275
 
276
  def extract_year_from_filename(filename: str):
277
  full_path = str(filename).replace("\\", "/")
278
  for part in reversed(full_path.split("/")):
279
  m = re.findall(r"\b(20\d{2}|19\d{2})\b", part)
280
  if m: return int(m[0])
281
- for pat in [r'WEO[_\-\s]?(\d{4})', r'BOA[_\-\s]?(\d{4})', r'IMF[_\-\s]?(\d{4})', r'rapport[_\-\s]?(\d{4})', r'report[_\-\s]?(\d{4})']:
 
 
282
  m = re.search(pat, full_path, re.IGNORECASE)
283
  if m: return int(m.group(1))
284
  all_y = re.findall(r'\b(19\d{2}|20\d{2})\b', full_path)
@@ -324,12 +340,15 @@ def load_file(path):
324
  from docx import Document
325
  doc = Document(path)
326
  pars = [p.text for p in doc.paragraphs if p.text.strip()]
327
- return [{"text": "\n".join(pars[i:i+50]), "page": i//50+1} for i in range(0, len(pars), 50)] or [{"text":"Empty DOCX.","page":1}]
328
- except Exception as e: return [{"text": f"DOCX error: {e}", "page": 1}]
 
 
329
  if path.endswith(".csv"):
330
  df = pd.read_csv(path)
331
  col = "text" if "text" in df.columns else df.columns[0]
332
- return [{"text": t, "page": i+1} for i, t in enumerate(df[col].dropna().astype(str))]
 
333
  with open(path, "r", encoding="utf-8", errors="ignore") as f:
334
  return [{"text": f.read(), "page": 1}]
335
 
@@ -340,27 +359,32 @@ def build_index(files):
340
  file_paths = []
341
  if not isinstance(files, list): files = [files]
342
  for f in files:
343
- if isinstance(f, str): file_paths.append(f)
344
- elif isinstance(f, dict): file_paths.append(f.get("path") or f.get("name") or str(f))
345
- elif hasattr(f, "name"): file_paths.append(f.name)
346
- else: file_paths.append(str(f))
347
 
348
  for p in file_paths:
349
  full_path = str(p)
350
  fname = clean_filename(full_path)
351
  year = extract_year_from_filename(fname) or extract_year_from_filename(full_path)
352
- pages, file_texts = load_file(full_path), []
 
353
  for pg in pages:
354
  for ch in chunk_text(pg["text"]):
355
  KB_TEXTS.append(ch)
356
- KB_META.append({"name": fname, "lang": detect_lang(ch), "page": pg["page"], "year": year})
 
357
  file_texts.append(ch)
358
  ti = detect_document_type(file_texts)
359
  ti["year"] = year
360
  PER_FILE_INFO[fname] = ti
361
 
362
  if not KB_TEXTS: raise gr.Error("โš ๏ธ No text extracted.")
363
- KB_EMB = embedder.encode(KB_TEXTS, convert_to_numpy=True, normalize_embeddings=True, show_progress_bar=False).astype("float32")
 
 
 
364
  FAISS_INDEX = faiss.IndexFlatIP(KB_EMB.shape[1])
365
  FAISS_INDEX.add(KB_EMB)
366
  DOC_TYPE_INFO = detect_document_type(KB_TEXTS)
@@ -375,20 +399,33 @@ def build_index(files):
375
  tbl += f"| `{fname}` | {yrb} | {info['type']}{badge} | {info['confidence']:.0%} | {n} |\n"
376
 
377
  ef = [f for f,i in PER_FILE_INFO.items() if i["is_economic"]]
378
- fmsg = (f"\n\n๐ŸŸข **Economic:** " + ", ".join(f"`{f}`" for f in ef) + "\nโžก๏ธ Go to **๐Ÿ“ˆ 7 ยท Forecast** tab.") if ef else ""
 
 
 
 
379
  save_index()
380
- return f"โœ… **Index built!**\n\n| | |\n|---|---|\n| ๐Ÿ“ฆ Chunks | **{len(KB_TEXTS):,}** |\n| ๐Ÿ“„ Files | **{len(file_paths)}** |\n| ๐Ÿ‡ธ๐Ÿ‡ฆ Arabic | **{lang_count.get('ar',0):,}** |\n| ๐Ÿ‡บ๐Ÿ‡ธ English| **{lang_count.get('en',0):,}** |\n\n---\n### ๐Ÿ“‹ Per-File\n\n{tbl}{fmsg}"
 
 
 
 
 
 
 
 
381
 
382
  def bm25_score(query_terms, doc, k1=1.5, b=0.75, avg_dl=200):
383
  try:
384
  if not KB_TEXTS or not isinstance(doc, str): return 0.0
385
- dl, score, df = len(doc.split()), 0.0, Counter(doc.lower().split())
 
386
  for term in query_terms:
387
  if not isinstance(term, str) or not term: continue
388
- tl = term.lower()
389
  n_doc = sum(1 for t in KB_TEXTS if isinstance(t,str) and tl in t.lower())
390
- tf = df.get(tl, 0)
391
- idf = math.log((len(KB_TEXTS)+1)/(1+n_doc))
392
  score += idf*(tf*(k1+1))/(tf+k1*(1-b+b*dl/max(avg_dl,1)))
393
  return score
394
  except: return 0.0
@@ -396,24 +433,28 @@ def bm25_score(query_terms, doc, k1=1.5, b=0.75, avg_dl=200):
396
  def rag_retrieve(query, k=5, top_n=3):
397
  if FAISS_INDEX is None or not KB_TEXTS: return []
398
  try:
399
- q_emb = embedder.encode([query], convert_to_numpy=True, normalize_embeddings=True).astype("float32")
 
 
400
  scores, idx = FAISS_INDEX.search(q_emb, min(k*3, len(KB_TEXTS)))
401
  candidates, qterms = [], [t for t in re.findall(r"\w+", str(query).lower()) if t]
402
  for rank, i in enumerate(idx[0]):
403
  if i == -1: continue
404
- sem = float(scores[0][rank])
405
  if sem < MIN_SIMILARITY: continue
406
  text = KB_TEXTS[i]
407
  if not isinstance(text, str): continue
408
  kw = bm25_score(qterms, text)
409
  lterms = [t for t in qterms if len(t) > 2]
410
- try: exact = all(re.search(rf"\b{re.escape(t)}\b", text.lower()) for t in lterms) if lterms else False
 
411
  except: exact = False
412
  hybrid = sem*0.6 + min(kw/10, 0.4) + (0.15 if exact else 0.0)
413
  candidates.append({
414
  "idx": i, "sem": sem, "kw": kw, "exact": exact, "hybrid": hybrid,
415
  "lang": KB_META[i]["lang"], "file": KB_META[i]["name"],
416
- "page": KB_META[i]["page"], "year": KB_META[i].get("year"), "text": text,
 
417
  })
418
  if not candidates: return []
419
  ce_scores = reranker.predict([[query, c["text"]] for c in candidates])
@@ -428,7 +469,7 @@ def rag_retrieve(query, k=5, top_n=3):
428
  return []
429
 
430
  def get_economic_chunks(texts: list, max_chunks: int = 40) -> list:
431
- n = len(texts)
432
  econ = [t for t in texts if any(kw in t.lower() for kw in ECON_TRIGGER)]
433
  if len(econ) < 10:
434
  start = texts[:min(10, n)]
@@ -438,7 +479,8 @@ def get_economic_chunks(texts: list, max_chunks: int = 40) -> list:
438
  if len(econ) > max_chunks:
439
  step = max(1, len(econ) // max_chunks)
440
  sample = econ[::step][:max_chunks]
441
- else: sample = econ
 
442
  return sample
443
 
444
  def llm_groq(question, rag_context, history, lang):
@@ -451,24 +493,32 @@ def llm_groq(question, rag_context, history, lang):
451
  "- Be concise, helpful, accurate."
452
  )
453
  messages = [{"role": "system", "content": system_prompt}]
454
- for turn in history[-4:]: messages.append({"role": turn["role"], "content": turn["content"]})
 
455
  user_content = f"๐Ÿ“„ Context:\n{rag_context}\n\nQuestion: {question}" if rag_context else question
456
  messages.append({"role": "user", "content": user_content})
457
  try:
458
- r = groq_client.chat.completions.create(model="llama-3.3-70b-versatile", messages=messages, temperature=0.3, max_tokens=512)
 
 
 
 
 
459
  return r.choices[0].message.content.strip()
460
- except Exception as e: return f"โš ๏ธ Groq error: {e}"
 
461
 
462
  def smart_answer(question, history):
463
- lang = detect_lang(question)
464
  results = rag_retrieve(question, k=5, top_n=3)
465
  rag_context = ""
466
  if results:
467
- for r in results: rag_context += f"[Source: {r['file']} - Page {r['page']}]\n{r['text']}\n\n"
 
468
  has_good_rag = bool(results) and results[0]["sem"] >= 0.25
469
- answer_text = llm_groq(question, rag_context[:2000], history, lang)
470
  if has_good_rag:
471
- src = ", ".join(f"`{r['file']}` p.{r['page']}" for r in results)
472
  badge = f"\n\n๐Ÿ“„ **{'ุงู„ู…ุตุฏุฑ' if lang=='ar' else 'Source'}:** {src}"
473
  CHAT_STATS["found"] += 1
474
  else:
@@ -480,7 +530,8 @@ def smart_answer(question, history):
480
  def predict_with_rag(text):
481
  text = "" if text is None else str(text).strip()
482
  if not text: raise gr.Error("โš ๏ธ Enter text first.")
483
- lang, qterms = detect_lang(text), [t for t in re.findall(r"\w+", text.lower()) if len(t)>2]
 
484
  exact_hits = []
485
  for i, chunk in enumerate(KB_TEXTS):
486
  if not isinstance(chunk, str): continue
@@ -490,8 +541,13 @@ def predict_with_rag(text):
490
  if re.search(rf"\b{re.escape(term)}\b", cl):
491
  for s in re.split(r"(?<=[.!?ุŸ\n])\s+", chunk):
492
  if re.search(rf"\b{re.escape(term)}\b", s.lower()):
493
- exact_hits.append({"word": term, "file": KB_META[i]["name"], "sentence": s.strip(), "lang": KB_META[i]["lang"], "chunk_id": i, "page": KB_META[i]["page"]})
 
 
 
 
494
  except: continue
 
495
  sem_results, md = rag_retrieve(text, k=5, top_n=3), ""
496
  if exact_hits:
497
  seen, unique = set(), []
@@ -510,33 +566,49 @@ def predict_with_rag(text):
510
  k2 = (h["file"], h["chunk_id"])
511
  if k2 in seen2: continue
512
  seen2.add(k2)
513
- md += f"### ๐Ÿ“„ `{h['file']}` โ€” p.{h['page']} {'๐Ÿ‡ธ๐Ÿ‡ฆ' if h['lang']=='ar' else '๐Ÿ‡บ๐Ÿ‡ธ'}\n\n```\n{KB_TEXTS[h['chunk_id']]}\n```\n\n"
514
  else:
515
  sent, conf = "โŒ Not found", 0.0
516
- md += f"## โŒ ุงู„ูƒู„ู…ุฉ ุบูŠุฑ ู…ูˆุฌูˆุฏุฉ\n\n**`{text}`** ู„ู… ุชูุฐูƒุฑ ุญุฑููŠุงู‹.\n\n" if lang=="ar" else f"## โŒ Word Not Found\n\n**`{text}`** not found literally.\n\n"
 
 
 
 
517
  if sem_results:
518
  md += "---\n## ๐Ÿ” Semantic Results\n\n"
519
  for r in sem_results:
520
- bar = "๐ŸŸฉ"*round(r["sem"]*10) + "โฌœ"*(10-round(r["sem"]*10))
521
  snippet = r["text"][:300].strip()
522
  for t in qterms:
523
  try: snippet = re.sub(rf"(?i)({re.escape(t)})", r"**\1**", snippet)
524
  except: pass
525
- md += f"### Result {r['rank']} โ€” {bar} `{r['sem']*100:.1f}%` {'๐Ÿ‡ธ๐Ÿ‡ฆ' if r['lang']=='ar' else '๐Ÿ‡บ๐Ÿ‡ธ'}\n\n๐Ÿ“„ `{r['file']}` p.{r['page']}\n\n> {snippet}...\n\n"
526
- else: md += "---\n_No similar content found._\n"
 
 
 
 
 
527
  return sent, round(conf, 4), md
528
 
529
  # ============================================================
530
- # DATA & ECONOMETRICS
531
  # ============================================================
532
  def get_worldbank_data(country_code, indicator, start_year, end_year):
533
- url = f"https://api.worldbank.org/v2/country/{country_code}/indicator/{indicator}?date={start_year}:{end_year}&per_page=100&format=json"
 
 
 
534
  try:
535
  resp = requests.get(url, timeout=15)
536
  resp.raise_for_status()
537
  data = resp.json()
538
  if not data or len(data) < 2 or not data[1]: return pd.DataFrame()
539
- rows = [{"year": int(e["date"]), "value": float(e["value"])} for e in data[1] if e.get("value") is not None and e.get("date") is not None]
 
 
 
 
540
  return pd.DataFrame(rows).dropna().sort_values("year").reset_index(drop=True)
541
  except Exception as e:
542
  print(f"World Bank error: {e}")
@@ -553,16 +625,26 @@ def build_doc_sentiment_index():
553
  sample = get_economic_chunks(texts, max_chunks=40)
554
  scores = [sentiment_score_numeric(t) for t in sample]
555
  avg = round(float(np.mean(scores)), 4)
556
- year = next((m["year"] for m in KB_META if m["name"]==fname and m.get("year")), None)
 
 
557
  file_results.append({
558
- "file": fname, "year": year if year else "N/A", "sentiment": avg, "n_chunks": len(sample),
559
- "label": "๐ŸŸข Optimistic" if avg > 0.05 else "๐Ÿ”ด Pessimistic" if avg < -0.05 else "๐ŸŸก Neutral"
 
560
  })
561
- if year: yearly_sentiment.setdefault(year, []).append(avg)
 
562
 
563
- yearly_avg = {yr: round(float(np.mean(vals)), 4) for yr, vals in yearly_sentiment.items()}
 
 
 
564
  df_files = pd.DataFrame(file_results).sort_values("year")
565
- df_yearly = pd.DataFrame([{"year": y, "sentiment": s} for y, s in sorted(yearly_avg.items())]) if yearly_avg else None
 
 
 
566
  return df_files, df_yearly
567
 
568
  def run_adf_check(series: np.ndarray, name: str):
@@ -570,62 +652,79 @@ def run_adf_check(series: np.ndarray, name: str):
570
  def adf_p(s):
571
  try: return adfuller(s, autolag='AIC')[1]
572
  except: return 1.0
573
-
574
  s = series.copy()
575
  p0 = adf_p(s)
576
- if p0 <= 0.05: return s, f"โœ… Stationary at level (p={p0:.4f})", False
577
-
578
  s1 = np.diff(s)
579
  p1 = adf_p(s1)
580
- if p1 <= 0.05: return s1, f"โš ๏ธ Non-stat (p={p0:.4f}) โ†’ 1st diff โ†’ โœ… stat (p={p1:.4f})", True
581
-
582
  s2 = np.diff(s1)
583
  p2 = adf_p(s2)
584
- return s2, f"โš ๏ธ Non-stat (p={p0:.4f}) โ†’ 1st diff (p={p1:.4f}) โ†’ 2nd diff โ†’ {'โœ… stat' if p2<=0.05 else 'โš ๏ธ non-stat'} (p={p2:.4f})", True
 
 
 
 
 
585
 
586
  def run_granger_test(series_y, series_exog, maxlag=4):
587
  try:
588
  from statsmodels.tsa.stattools import grangercausalitytests
589
- if len(series_y) < 10: return "โš ๏ธ **Granger Test skipped** โ€” need โ‰ฅ 10 points.", False
 
590
  sy, status_y = run_adf_check(series_y.copy(), "Target")[:2]
591
  sexog, status_exog = run_adf_check(series_exog.copy(), "Sentiment")[:2]
592
-
593
  min_len = min(len(sy), len(sexog))
594
  sy, sexog = sy[-min_len:], sexog[-min_len:]
595
  maxlag = min(maxlag, max(1, (len(sy) - 1) // 3))
596
-
597
- if len(sy) < 5: return "โš ๏ธ **Granger Test skipped** โ€” too few obs after differencing.", False
598
-
599
- gc_result = grangercausalitytests(np.column_stack([sy, sexog]), maxlag=maxlag, verbose=False)
 
600
  rows, any_pass, best_p = [], False, 1.0
601
  for lag, res in gc_result.items():
602
  p_val = res[0]["ssr_ftest"][1]
603
- if p_val < 0.05: sig = "โœ… Yes"; any_pass = True
604
- elif p_val < 0.10: sig = "๐Ÿ”ถ Marginal"
605
- else: sig = "โŒ No"
 
606
  best_p = min(best_p, p_val)
607
- rows.append(f"| {lag} | {res[0]['ssr_ftest'][0]:.4f} | {p_val:.4f} | {sig} |")
608
 
609
  table = (
610
  "### ๐Ÿ”ฌ Granger Causality Test\n"
611
  "*Hโ‚€: Sentiment does NOT Granger-cause Target*\n\n"
612
- f"| Series | ADF Result |\n|---|---|\n| ๐ŸŽฏ Target | {status_y} |\n| ๐Ÿ˜Š Sentiment | {status_exog} |\n\n"
613
- "| Lag | F-stat | p-value | Significant? |\n|-----|--------|---------|-------------|\n" + "\n".join(rows)
 
 
 
 
 
614
  )
615
- if any_pass: verdict = "\n\nโœ… **PASS** โ€” Sentiment significantly Granger-causes the target (p < 0.05)."
616
- elif best_p < 0.10: verdict = f"\n\n๐Ÿ”ถ **MARGINAL** โ€” best p = {best_p:.4f} (< 0.10)."
617
- else: verdict = "\n\nโŒ **FAIL** โ€” No significant Granger causality (p โ‰ฅ 0.05)."
 
 
 
618
  return table + verdict, any_pass
619
- except Exception as e: return f"โš ๏ธ Granger test error: `{e}`\n", False
 
620
 
621
  def run_dm_test(actual, pred_arima, pred_sarimax):
622
  try:
623
  n = len(actual)
624
- if n < 3: return "โš ๏ธ **DM Test skipped** โ€” only n < 3.", False
625
- d = (actual - pred_arima)**2 - (actual - pred_sarimax)**2
626
- d_mean, d_std = np.mean(d), np.std(d, ddof=1)
627
- if d_std < 1e-10: return "โš ๏ธ **DM Test** โ€” models identical.", False
628
-
 
 
629
  dm_stat = d_mean / (d_std / np.sqrt(n))
630
  p_val = 2 * (1 - stats.t.cdf(abs(dm_stat), df=n - 1))
631
  sig = "โœ… Yes" if p_val < 0.05 else ("๐Ÿ”ถ Marginal" if p_val < 0.10 else "โŒ No")
@@ -633,23 +732,36 @@ def run_dm_test(actual, pred_arima, pred_sarimax):
633
 
634
  table = (
635
  "### ๐ŸŽฏ Diebold-Mariano Test\n"
 
636
  "| DM Statistic | p-value | n (test) | Significant? | Better Model |\n"
637
  "|-------------|---------|----------|-------------|-------------|\n"
638
  f"| `{dm_stat:.4f}` | `{p_val:.4f}` | `{n}` | {sig} | **{better}** |\n"
639
  )
640
  passed = p_val < 0.05 and dm_stat > 0
641
- if passed: verdict = "\nโœ… **PASS** โ€” SARIMAX+Ensemble is **significantly better** (p < 0.05)."
642
- elif (p_val < 0.10) and dm_stat > 0: verdict = f"\n๐Ÿ”ถ **MARGINAL** โ€” p = {p_val:.4f} (< 0.10)."
643
- else: verdict = f"\nโŒ **FAIL** โ€” Not statistically significant (p = {p_val:.4f}). Limited power with n={n}."
 
 
 
 
 
 
 
644
  return table + verdict, passed
645
- except Exception as e: return f"โš ๏ธ DM error: `{e}`\n", False
 
646
 
 
 
 
647
  def run_economic_forecast(country_code, target_var, start_year, end_year):
648
  try:
649
  from statsmodels.tsa.arima.model import ARIMA
650
  from statsmodels.tsa.statespace.sarimax import SARIMAX
651
  from sklearn.metrics import mean_squared_error, mean_absolute_error
652
- except ImportError: return "โŒ pip install statsmodels scikit-learn", None
 
653
 
654
  indicator_map = {
655
  "Inflation (CPI %)" : "FP.CPI.TOTL.ZG",
@@ -657,190 +769,424 @@ def run_economic_forecast(country_code, target_var, start_year, end_year):
657
  "Unemployment (%) ": "SL.UEM.TOTL.ZS",
658
  "Exchange Rate" : "PA.NUS.FCRF",
659
  }
660
- econ_df = get_worldbank_data(country_code, indicator_map.get(target_var, "FP.CPI.TOTL.ZG"), int(start_year), int(end_year))
661
- if econ_df.empty: return f"โŒ No data for **{country_code}** / **{target_var}**", None
662
- if len(econ_df) < 5: return f"โš ๏ธ Only **{len(econ_df)}** data points. Widen year range.", None
 
 
 
 
 
 
663
 
664
  df_files, df_yearly = build_doc_sentiment_index()
665
 
666
  if df_yearly is not None and len(df_yearly) >= 2:
667
- merged = econ_df.merge(df_yearly, on="year", how="left")
668
- merged["sentiment"] = merged["sentiment"].fillna(float(df_yearly["sentiment"].mean()))
669
- has_yearly, mode_msg = True, "โœ… **Yearly Ensemble Sentiment**"
 
 
 
670
  else:
671
- global_sent = float(pd.to_numeric(df_files["sentiment"], errors="coerce").mean()) if df_files is not None and len(df_files) > 0 else 0.0
 
 
 
672
  merged = econ_df.copy()
673
  merged["sentiment"] = global_sent
674
- has_yearly, mode_msg = False, "โš ๏ธ **Global Sentiment**"
 
675
 
676
  if merged["sentiment"].std() > 1e-6:
677
  scaler = MinMaxScaler(feature_range=(-0.3, 0.3))
678
- merged["sentiment"] = scaler.fit_transform(merged["sentiment"].values.reshape(-1, 1)).flatten().round(4)
 
 
679
 
680
  series = merged["value"].values.astype(float)
681
  exog = merged["sentiment"].values.reshape(-1, 1)
682
  years = merged["year"].values
683
  n = len(series)
684
 
685
- # ==========================================================
686
- # โœ… ุงู„ุนูˆุฏุฉ ุฅู„ู‰ n = 3 ุจู†ุงุกู‹ ุนู„ู‰ ู†ุชุงุฆุฌูƒ ุงู„ู…ู…ุชุงุฒุฉ
687
- # ==========================================================
688
  split = n - 3
689
- if split < 5:
690
- split = max(int(n * 0.75), 5) # Fallback if data is too short
691
 
692
- train_y, test_y = series[:split], series[split:]
693
  train_exog, test_exog = exog[:split], exog[split:]
694
  test_years = years[split:]
695
 
 
696
  try:
697
- m1 = ARIMA(train_y, order=(1,1,1)).fit()
698
  pred_arima = m1.forecast(len(test_y))
699
  rmse_a = float(np.sqrt(mean_squared_error(test_y, pred_arima)))
700
  mae_a = float(mean_absolute_error(test_y, pred_arima))
701
  mape_a = float(np.mean(np.abs((test_y-pred_arima)/np.maximum(np.abs(test_y),1e-8)))*100)
702
- except Exception as e: return f"โŒ ARIMA error: {e}", None
 
703
 
 
704
  try:
705
- m2 = SARIMAX(train_y, exog=train_exog, order=(1,1,1)).fit(disp=False)
706
  pred_sarimax = m2.forecast(len(test_y), exog=test_exog)
707
  rmse_s = float(np.sqrt(mean_squared_error(test_y, pred_sarimax)))
708
  mae_s = float(mean_absolute_error(test_y, pred_sarimax))
709
  mape_s = float(np.mean(np.abs((test_y-pred_sarimax)/np.maximum(np.abs(test_y),1e-8)))*100)
710
- except Exception as e: return f"โŒ SARIMAX error: {e}", None
 
711
 
712
  impr_rmse = (rmse_a - rmse_s) / rmse_a * 100
713
  impr_mae = (mae_a - mae_s) / mae_a * 100
714
  impr_mape = (mape_a - mape_s) / mape_a * 100
715
 
 
716
  if has_yearly and df_yearly is not None and len(df_yearly) >= 5:
717
  real_merged = econ_df.merge(df_yearly, on="year", how="inner")
718
- gc_y, gc_exog = real_merged["value"].values.astype(float), real_merged["sentiment"].values.astype(float)
 
719
  else:
720
- gc_y, gc_exog = series, merged["sentiment"].values
 
721
 
722
  granger_md, granger_pass = run_granger_test(gc_y, gc_exog, maxlag=4)
723
  dm_md, dm_pass = run_dm_test(test_y, np.array(pred_arima), np.array(pred_sarimax))
724
 
 
 
 
725
  fig, axes = plt.subplots(4, 1, figsize=(11, 18))
 
 
726
  axes[0].plot(years, series, "o-", color="#2196F3", label="Actual", lw=2, ms=5)
727
- axes[0].plot(test_years, pred_arima, "s--", color="#FF5722", label="ARIMA(1,1,1)", lw=2)
728
  axes[0].plot(test_years, pred_sarimax, "^-.", color="#4CAF50", label="SARIMAX+Ensemble", lw=2)
729
  axes[0].axvline(x=years[split-1], color="gray", linestyle=":", alpha=0.7, label="Trainโ”‚Test")
730
- axes[0].set_title(f"๐Ÿ“ˆ {target_var} โ€” {country_code} | n_test={len(test_y)}", fontsize=12, fontweight="bold")
 
 
 
 
731
  axes[0].legend(fontsize=9); axes[0].grid(True, alpha=0.3)
732
 
733
- s_clrs = ["#4CAF50" if s>0.05 else "#FF5722" if s<-0.05 else "#FFC107" for s in merged["sentiment"]]
 
 
 
 
734
  axes[1].bar(years, merged["sentiment"], color=s_clrs, edgecolor="white", width=0.6)
735
  axes[1].axhline(y=0, color="black", lw=0.8)
736
- axes[1].set_title("๐Ÿ“Š Ensemble Sentiment Index (normalized)", fontsize=10, fontweight="bold")
 
 
 
 
 
 
 
 
 
 
 
737
  axes[1].grid(True, alpha=0.3, axis="y")
738
 
739
- bars = axes[2].bar(["ARIMA(1,1,1)", "SARIMAX\n+Ensemble"], [rmse_a, rmse_s], color=["#FF5722" if rmse_a <= rmse_s else "#4CAF50", "#4CAF50" if rmse_s <= rmse_a else "#FF5722"], width=0.4, edgecolor="white")
 
 
 
 
 
 
 
 
740
  for bar, val in zip(bars, [rmse_a, rmse_s]):
741
- axes[2].text(bar.get_x()+bar.get_width()/2, bar.get_height()+0.01, f"{val:.4f}", ha="center", va="bottom", fontweight="bold", fontsize=11)
742
- axes[2].set_title("๐Ÿ“‰ RMSE Comparison (lower = better)", fontsize=11); axes[2].grid(True, alpha=0.3, axis="y")
 
 
 
 
743
 
 
744
  axes[3].axis("off")
745
  test_data = [
746
  ["Test", "Result", "Interpretation"],
747
- ["Granger (ADF + Granger)", "โœ… PASS" if granger_pass else "โŒ FAIL", "Sentiment Granger-causes Target" if granger_pass else "No causal link"],
748
- ["Diebold-Mariano", "โœ… PASS" if dm_pass else "โŒ FAIL", "SARIMAX significantly better" if dm_pass else f"n_test={len(test_y)} โ€” limited power"],
 
 
 
 
 
 
 
 
749
  ]
750
- tbl4 = axes[3].table(cellText=test_data[1:], colLabels=test_data[0], cellLoc="center", loc="center", colWidths=[0.35, 0.2, 0.45])
 
 
 
751
  tbl4.auto_set_font_size(False); tbl4.set_fontsize(11); tbl4.scale(1, 2.5)
752
  for (row, col), cell in tbl4.get_celld().items():
753
- if row == 0: cell.set_facecolor("#1565C0"); cell.set_text_props(color="white", fontweight="bold")
754
- elif row in [1, 2]: cell.set_facecolor("#E8F5E9" if (granger_pass if row==1 else dm_pass) else "#FFEBEE")
755
- axes[3].set_title("๐Ÿ”ฌ Statistical Tests: ADF + Granger + DM", fontsize=12, fontweight="bold", pad=20)
 
 
 
 
 
 
 
 
756
 
757
  plt.tight_layout(pad=3.0)
758
  img_path = "/tmp/forecast_plot.png"
759
- plt.savefig(img_path, dpi=130, bbox_inches="tight"); plt.close(fig)
 
760
 
 
 
 
761
  sent_table = ""
762
  if df_files is not None and len(df_files) > 0:
763
- sent_table = "\n---\n### ๐Ÿ“„ Ensemble Sentiment per File\n| ๐Ÿ“„ File | ๐Ÿ“… Year | ๐Ÿ˜Š Score | ๐Ÿ“ฆ Chunks | Label |\n|---|---|---|---|---|\n"
764
- for _, row in df_files.iterrows(): sent_table += f"| `{row['file']}` | {row['year']} | `{row['sentiment']:+.4f}` | {row['n_chunks']} | {row['label']} |\n"
 
 
 
 
 
 
 
765
 
766
  result_md = (
767
- f"## ๐Ÿ“Š Forecast Results โ€” {country_code} / {target_var}\n\n"
768
- f"| | |\n|---|---|\n| ๐ŸŽฏ Target | **{target_var}** |\n| ๐Ÿ“ˆ Train | **{split}** samples |\n| ๐Ÿงช Test (n) | **{len(test_y)}** samples |\n\n"
769
- f"---\n### ๐Ÿ† Model Comparison\n| Model | RMSE | MAE | MAPE |\n|---|---|---|---|\n"
770
- f"| ARIMA(1,1,1) | `{rmse_a:.4f}` | `{mae_a:.4f}` | `{mape_a:.1f}%` |\n"
771
- f"| SARIMAX+Ens | `{rmse_s:.4f}` | `{mae_s:.4f}` | `{mape_s:.1f}%` |\n"
772
- f"| **Improvement**| **{impr_rmse:+.1f}%** | **{impr_mae:+.1f}%** | **{impr_mape:+.1f}%** |\n\n"
 
 
 
 
 
 
773
  f"---\n{granger_md}\n\n---\n{dm_md}\n{sent_table}"
774
  )
775
  return result_md, img_path
776
 
777
  # ============================================================
778
- # UTILS & UI (Gradio)
779
  # ============================================================
780
  def generate_report(text, sent, conf, md):
781
  path = "/tmp/report.md"
782
- with open(path, "w", encoding="utf-8") as f: f.write(f"# Report\n\n**Input:** {text}\n**Sentiment:** {sent}\n\n{md}")
 
783
  return path
 
784
  def export_chat(history):
785
  path = "/tmp/chat.txt"
786
  with open(path, "w", encoding="utf-8") as f:
787
- for turn in history: f.write(f"{turn['role']}:\n{turn['content']}\n\n")
 
788
  return path
789
- def get_stats(): return f"### ๐Ÿ“Š Stats\nQuestions: {CHAT_STATS['questions']} | Chunks: {len(KB_TEXTS)}"
 
 
 
 
 
 
 
 
 
 
790
  def get_top_keywords():
791
- if not KB_TEXTS: return "_No files_"
792
- top = Counter(w for w in re.findall(r"\b\w{4,}\b", " ".join(KB_TEXTS).lower()) if w not in {"this","that","with","from"}).most_common(20)
793
- return "### ๐Ÿ”‘ Top Keywords\n" + "\n".join(f"- **{w}**: {c}" for w,c in top)
794
- def update_threshold(val): global MIN_SIMILARITY; MIN_SIMILARITY = val; return f"Threshold: {val:.0%}"
 
 
 
 
 
 
 
 
 
 
795
  def chat_text(message, history):
796
  if not message.strip(): return "", history
797
  answer, _ = smart_answer(message, history)
798
- return "", history + [{"role": "user", "content": message}, {"role": "assistant", "content": answer}]
 
 
 
 
799
  def tts_save(text, lang="en"):
800
  path = "/tmp/ans.mp3"
801
- gTTS(text=re.sub(r"[*`#>\[\]|_]", "", text)[:600], lang="ar" if lang=="ar" else "en").save(path)
 
 
 
802
  return path
 
803
  def chat_voice(audio, history):
804
- if audio is None: raise gr.Error("No audio.")
805
  sr, y = audio
806
  y = np.array(y) if isinstance(y, list) else y
807
  if y.ndim > 1: y = y.mean(axis=1)
808
  transcript = asr({"array": y.astype(np.float32), "sampling_rate": sr})["text"]
809
- lang = detect_lang(transcript)
810
- answer, _ = smart_answer(transcript, history)
811
- return history + [{"role": "user", "content": f"๐ŸŽ™๏ธ {transcript}"}, {"role": "assistant", "content": answer}], tts_save(answer, lang), transcript
 
 
 
 
 
 
 
 
 
 
 
 
 
812
 
813
- with gr.Blocks(title="RAG + Sentiment", theme=gr.themes.Soft()) as app:
814
- gr.Markdown("# ๐Ÿค– Multilingual RAG + Ensemble Sentiment + Economic Forecast")
815
-
816
  with gr.Tab("๐Ÿ“ 1 ยท Upload"):
817
- files = gr.File(label="๐Ÿ“‚ Files", file_types=[".pdf",".txt",".csv",".docx"], file_count="multiple", type="filepath")
818
- build_btn, status = gr.Button("๐Ÿ”จ Build Index", variant="primary"), gr.Markdown("_No index yet._")
819
- save_btn, load_btn, persist_status = gr.Button("๐Ÿ’พ Save"), gr.Button("๐Ÿ”„ Load"), gr.Markdown()
820
- build_btn.click(build_index, inputs=files, outputs=status)
821
- save_btn.click(save_index, outputs=persist_status); load_btn.click(load_saved_index, outputs=persist_status)
822
-
823
- with gr.Tab("๐ŸŽญ 2 ยท Sentiment"):
824
- inp = gr.Textbox(lines=2, label="๐Ÿ“ Input")
825
- run_btn = gr.Button("๐Ÿ” Analyze", variant="primary")
826
- out_sent, out_conf, out_full = gr.Textbox(label="Sentiment"), gr.Number(label="Score"), gr.Markdown()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
827
  run_btn.click(predict_with_rag, inputs=inp, outputs=[out_sent, out_conf, out_full])
 
828
 
829
- with gr.Tab("๐Ÿ’ฌ 3 ยท Chatbot"):
830
- chatbot = gr.Chatbot(height=430, type="messages")
831
- msg = gr.Textbox(placeholder="Questionโ€ฆ", label="๐Ÿ’ฌ")
832
- msg.submit(chat_text, inputs=[msg,chatbot], outputs=[msg,chatbot])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
833
 
 
834
  with gr.Tab("๐Ÿ“ˆ 7 ยท Forecast"):
835
- gr.Markdown("## ๐Ÿ“ˆ ARIMA vs SARIMAX + Ensemble Sentiment\n**n_test = 3** (Captures recent turbulent years)")
 
 
 
836
  with gr.Row():
837
- country_input = gr.Textbox(value="DZ", label="๐ŸŒ Country Code")
838
- target_input = gr.Dropdown(choices=["Inflation (CPI %)", "GDP Growth (%) ", "Unemployment (%) ", "Exchange Rate"], value="Inflation (CPI %)", label="๐ŸŽฏ Target Variable")
 
 
 
 
 
 
 
 
 
 
 
 
839
  with gr.Row():
840
- start_year = gr.Slider(minimum=1990, maximum=2020, value=2000, step=1, label="๐Ÿ“… Start Year")
841
- end_year = gr.Slider(minimum=2010, maximum=2024, value=2023, step=1, label="๐Ÿ“… End Year")
842
- forecast_btn = gr.Button("๐Ÿ“ˆ Run Forecast", variant="primary", size="lg")
843
- forecast_result, forecast_plot = gr.Markdown(), gr.Image(label="๐Ÿ“ˆ Chart", type="filepath")
844
- forecast_btn.click(run_economic_forecast, inputs=[country_input, target_input, start_year, end_year], outputs=[forecast_result, forecast_plot])
 
 
 
 
 
 
 
 
 
845
 
846
  app.launch(server_name="0.0.0.0", server_port=7860, show_api=False)
 
34
  # ============================================================
35
  GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
36
  groq_client = Groq(api_key=GROQ_API_KEY)
37
+ print(f"DEBUG โ€” Groq Key loaded: {bool(GROQ_API_KEY)}")
38
 
39
  # ============================================================
40
  # GLOBAL STATE
 
54
  FAISS_PATH = f"{PERSIST_DIR}/faiss.index"
55
  os.makedirs(PERSIST_DIR, exist_ok=True)
56
 
57
+ # ============================================================
58
+ # PERSIST
59
+ # ============================================================
60
  def save_index():
61
  if FAISS_INDEX is None or not KB_TEXTS:
62
  return "โš ๏ธ No index to save."
 
153
  }
154
  doc_type = max(scores, key=scores.get)
155
  confidence = round(scores[doc_type] / max(sum(scores.values()), 1), 2)
156
+ icons = {
157
  "economic":"๐Ÿ“Š Economic","medical":"๐Ÿฅ Medical",
158
  "legal":"โš–๏ธ Legal","academic":"๐ŸŽ“ Academic","general":"๐Ÿ“„ General",
159
  }
 
166
  }
167
 
168
  # ============================================================
169
+ # AI MODELS โ€” Ensemble: FinBERT 40% + XLM 30% + Lexicon 30%
170
  # ============================================================
171
  WEIGHTS = {"finbert": 0.40, "xlm": 0.30, "lexicon": 0.30}
172
 
173
+ print("โณ Loading FinBERT...")
174
  try:
175
  finbert_pipe = pipeline(
176
  "text-classification",
 
233
 
234
  def run_sentiment(text: str):
235
  score = sentiment_score_numeric(text)
236
+ if score > 0.05: sent = "Positive ๐Ÿ˜Š"
237
  elif score < -0.05: sent = "Negative ๐Ÿ˜ž"
238
+ else: sent = "Neutral ๐Ÿ˜"
239
  return sent, round(min(abs(score), 1.0), 4)
240
 
241
  def run_sentiment_detailed(text: str) -> str:
 
251
  return (
252
  f"### ๐Ÿ† Ensemble Sentiment Breakdown\n\n"
253
  f"| Model | Score | Bar | Weight |\n|---|---|---|---|\n"
254
+ f"| ๐Ÿฆ FinBERT | `{fb:+.4f}` | {bar(fb)} | **40%** |\n"
255
+ f"| ๐ŸŒ XLM-RoBERTa | `{xlm:+.4f}` | {bar(xlm)} | **30%** |\n"
256
+ f"| ๐Ÿ“– Lexicon | `{lex:+.4f}` | {bar(lex)} | **30%** |\n"
257
+ f"| โšก **Final** | **`{final:+.4f}`** | {bar(final)} | **100%** |\n\n"
258
  f"{label}"
259
  )
260
 
261
+ # ============================================================
262
+ # EMBEDDING + RERANKER + ASR
263
+ # ============================================================
264
+ print("โณ Loading Embedder, Reranker, ASR...")
265
  embedder = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
266
  reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2", max_length=512)
267
+ asr = hf_pipeline(
268
+ "automatic-speech-recognition",
269
+ model="openai/whisper-small",
270
+ device=0 if torch.cuda.is_available() else -1,
271
+ )
272
  _ = embedder.encode(["warmup"], convert_to_numpy=True)
273
  print("โœ… All models loaded!")
274
 
275
  _startup = load_saved_index()
276
+ print(f"๐Ÿ”„ Startup load: {_startup}")
277
 
278
  # ============================================================
279
  # RAG CORE
280
  # ============================================================
281
+ def clean_filename(path: str) -> str:
282
+ return os.path.basename(str(path))
283
+
284
  def detect_lang(text: str) -> str:
285
+ try:
286
+ return "ar" if str(detect(str(text)[:300])).startswith("ar") else "en"
287
+ except:
288
+ return "en"
289
 
290
  def extract_year_from_filename(filename: str):
291
  full_path = str(filename).replace("\\", "/")
292
  for part in reversed(full_path.split("/")):
293
  m = re.findall(r"\b(20\d{2}|19\d{2})\b", part)
294
  if m: return int(m[0])
295
+ for pat in [r'WEO[_\-\s]?(\d{4})', r'BOA[_\-\s]?(\d{4})',
296
+ r'IMF[_\-\s]?(\d{4})', r'rapport[_\-\s]?(\d{4})',
297
+ r'report[_\-\s]?(\d{4})']:
298
  m = re.search(pat, full_path, re.IGNORECASE)
299
  if m: return int(m.group(1))
300
  all_y = re.findall(r'\b(19\d{2}|20\d{2})\b', full_path)
 
340
  from docx import Document
341
  doc = Document(path)
342
  pars = [p.text for p in doc.paragraphs if p.text.strip()]
343
+ return [{"text": "\n".join(pars[i:i+50]), "page": i//50+1}
344
+ for i in range(0, len(pars), 50)] or [{"text":"Empty DOCX.","page":1}]
345
+ except Exception as e:
346
+ return [{"text": f"DOCX error: {e}", "page": 1}]
347
  if path.endswith(".csv"):
348
  df = pd.read_csv(path)
349
  col = "text" if "text" in df.columns else df.columns[0]
350
+ return [{"text": t, "page": i+1}
351
+ for i, t in enumerate(df[col].dropna().astype(str))]
352
  with open(path, "r", encoding="utf-8", errors="ignore") as f:
353
  return [{"text": f.read(), "page": 1}]
354
 
 
359
  file_paths = []
360
  if not isinstance(files, list): files = [files]
361
  for f in files:
362
+ if isinstance(f, str): file_paths.append(f)
363
+ elif isinstance(f, dict): file_paths.append(f.get("path") or f.get("name") or str(f))
364
+ elif hasattr(f, "name"): file_paths.append(f.name)
365
+ else: file_paths.append(str(f))
366
 
367
  for p in file_paths:
368
  full_path = str(p)
369
  fname = clean_filename(full_path)
370
  year = extract_year_from_filename(fname) or extract_year_from_filename(full_path)
371
+ pages = load_file(full_path)
372
+ file_texts = []
373
  for pg in pages:
374
  for ch in chunk_text(pg["text"]):
375
  KB_TEXTS.append(ch)
376
+ KB_META.append({"name": fname, "lang": detect_lang(ch),
377
+ "page": pg["page"], "year": year})
378
  file_texts.append(ch)
379
  ti = detect_document_type(file_texts)
380
  ti["year"] = year
381
  PER_FILE_INFO[fname] = ti
382
 
383
  if not KB_TEXTS: raise gr.Error("โš ๏ธ No text extracted.")
384
+ KB_EMB = embedder.encode(
385
+ KB_TEXTS, convert_to_numpy=True,
386
+ normalize_embeddings=True, show_progress_bar=False
387
+ ).astype("float32")
388
  FAISS_INDEX = faiss.IndexFlatIP(KB_EMB.shape[1])
389
  FAISS_INDEX.add(KB_EMB)
390
  DOC_TYPE_INFO = detect_document_type(KB_TEXTS)
 
399
  tbl += f"| `{fname}` | {yrb} | {info['type']}{badge} | {info['confidence']:.0%} | {n} |\n"
400
 
401
  ef = [f for f,i in PER_FILE_INFO.items() if i["is_economic"]]
402
+ fmsg = (
403
+ f"\n\n๐ŸŸข **Economic files detected:** " +
404
+ ", ".join(f"`{f}`" for f in ef) +
405
+ "\nโžก๏ธ Go to **๐Ÿ“ˆ 7 ยท Forecast** tab to run predictions."
406
+ ) if ef else ""
407
  save_index()
408
+ return (
409
+ f"โœ… **Index built!**\n\n"
410
+ f"| | |\n|---|---|\n"
411
+ f"| ๐Ÿ“ฆ Total chunks | **{len(KB_TEXTS):,}** |\n"
412
+ f"| ๐Ÿ“„ Files | **{len(file_paths)}** |\n"
413
+ f"| ๐Ÿ‡ธ๐Ÿ‡ฆ Arabic | **{lang_count.get('ar',0):,}** |\n"
414
+ f"| ๐Ÿ‡บ๐Ÿ‡ธ English | **{lang_count.get('en',0):,}** |\n\n"
415
+ f"---\n### ๐Ÿ“‹ Per-File Analysis\n\n{tbl}{fmsg}"
416
+ )
417
 
418
  def bm25_score(query_terms, doc, k1=1.5, b=0.75, avg_dl=200):
419
  try:
420
  if not KB_TEXTS or not isinstance(doc, str): return 0.0
421
+ dl, score = len(doc.split()), 0.0
422
+ df = Counter(doc.lower().split())
423
  for term in query_terms:
424
  if not isinstance(term, str) or not term: continue
425
+ tl = term.lower()
426
  n_doc = sum(1 for t in KB_TEXTS if isinstance(t,str) and tl in t.lower())
427
+ tf = df.get(tl, 0)
428
+ idf = math.log((len(KB_TEXTS)+1)/(1+n_doc))
429
  score += idf*(tf*(k1+1))/(tf+k1*(1-b+b*dl/max(avg_dl,1)))
430
  return score
431
  except: return 0.0
 
433
  def rag_retrieve(query, k=5, top_n=3):
434
  if FAISS_INDEX is None or not KB_TEXTS: return []
435
  try:
436
+ q_emb = embedder.encode(
437
+ [query], convert_to_numpy=True, normalize_embeddings=True
438
+ ).astype("float32")
439
  scores, idx = FAISS_INDEX.search(q_emb, min(k*3, len(KB_TEXTS)))
440
  candidates, qterms = [], [t for t in re.findall(r"\w+", str(query).lower()) if t]
441
  for rank, i in enumerate(idx[0]):
442
  if i == -1: continue
443
+ sem = float(scores[0][rank])
444
  if sem < MIN_SIMILARITY: continue
445
  text = KB_TEXTS[i]
446
  if not isinstance(text, str): continue
447
  kw = bm25_score(qterms, text)
448
  lterms = [t for t in qterms if len(t) > 2]
449
+ try:
450
+ exact = all(re.search(rf"\b{re.escape(t)}\b", text.lower()) for t in lterms) if lterms else False
451
  except: exact = False
452
  hybrid = sem*0.6 + min(kw/10, 0.4) + (0.15 if exact else 0.0)
453
  candidates.append({
454
  "idx": i, "sem": sem, "kw": kw, "exact": exact, "hybrid": hybrid,
455
  "lang": KB_META[i]["lang"], "file": KB_META[i]["name"],
456
+ "page": KB_META[i]["page"], "year": KB_META[i].get("year"),
457
+ "text": text,
458
  })
459
  if not candidates: return []
460
  ce_scores = reranker.predict([[query, c["text"]] for c in candidates])
 
469
  return []
470
 
471
  def get_economic_chunks(texts: list, max_chunks: int = 40) -> list:
472
+ n = len(texts)
473
  econ = [t for t in texts if any(kw in t.lower() for kw in ECON_TRIGGER)]
474
  if len(econ) < 10:
475
  start = texts[:min(10, n)]
 
479
  if len(econ) > max_chunks:
480
  step = max(1, len(econ) // max_chunks)
481
  sample = econ[::step][:max_chunks]
482
+ else:
483
+ sample = econ
484
  return sample
485
 
486
  def llm_groq(question, rag_context, history, lang):
 
493
  "- Be concise, helpful, accurate."
494
  )
495
  messages = [{"role": "system", "content": system_prompt}]
496
+ for turn in history[-4:]:
497
+ messages.append({"role": turn["role"], "content": turn["content"]})
498
  user_content = f"๐Ÿ“„ Context:\n{rag_context}\n\nQuestion: {question}" if rag_context else question
499
  messages.append({"role": "user", "content": user_content})
500
  try:
501
+ r = groq_client.chat.completions.create(
502
+ model="llama-3.3-70b-versatile",
503
+ messages=messages,
504
+ temperature=0.3,
505
+ max_tokens=512,
506
+ )
507
  return r.choices[0].message.content.strip()
508
+ except Exception as e:
509
+ return f"โš ๏ธ Groq error: {e}"
510
 
511
  def smart_answer(question, history):
512
+ lang = detect_lang(question)
513
  results = rag_retrieve(question, k=5, top_n=3)
514
  rag_context = ""
515
  if results:
516
+ for r in results:
517
+ rag_context += f"[Source: {r['file']} - Page {r['page']}]\n{r['text']}\n\n"
518
  has_good_rag = bool(results) and results[0]["sem"] >= 0.25
519
+ answer_text = llm_groq(question, rag_context[:2000], history, lang)
520
  if has_good_rag:
521
+ src = ", ".join(f"`{r['file']}` p.{r['page']}" for r in results)
522
  badge = f"\n\n๐Ÿ“„ **{'ุงู„ู…ุตุฏุฑ' if lang=='ar' else 'Source'}:** {src}"
523
  CHAT_STATS["found"] += 1
524
  else:
 
530
  def predict_with_rag(text):
531
  text = "" if text is None else str(text).strip()
532
  if not text: raise gr.Error("โš ๏ธ Enter text first.")
533
+ lang = detect_lang(text)
534
+ qterms = [t for t in re.findall(r"\w+", text.lower()) if len(t) > 2]
535
  exact_hits = []
536
  for i, chunk in enumerate(KB_TEXTS):
537
  if not isinstance(chunk, str): continue
 
541
  if re.search(rf"\b{re.escape(term)}\b", cl):
542
  for s in re.split(r"(?<=[.!?ุŸ\n])\s+", chunk):
543
  if re.search(rf"\b{re.escape(term)}\b", s.lower()):
544
+ exact_hits.append({
545
+ "word": term, "file": KB_META[i]["name"],
546
+ "sentence": s.strip(), "lang": KB_META[i]["lang"],
547
+ "chunk_id": i, "page": KB_META[i]["page"],
548
+ })
549
  except: continue
550
+
551
  sem_results, md = rag_retrieve(text, k=5, top_n=3), ""
552
  if exact_hits:
553
  seen, unique = set(), []
 
566
  k2 = (h["file"], h["chunk_id"])
567
  if k2 in seen2: continue
568
  seen2.add(k2)
569
+ md += f"### ๐Ÿ“„ `{h['file']}` โ€” p.{h['page']} {'๐Ÿ‡ธ๐Ÿ‡ฆ' if h['lang']=='ar' else '๐Ÿ‡บ๐Ÿ‡ธ'}\n\n```\n{KB_TEXTS[h['chunk_id']]}\n```\n\n"
570
  else:
571
  sent, conf = "โŒ Not found", 0.0
572
+ if lang == "ar":
573
+ md += f"## โŒ ุงู„ูƒู„ู…ุฉ ุบูŠุฑ ู…ูˆุฌูˆุฏุฉ\n\n**`{text}`** ู„ู… ุชูุฐูƒุฑ ุญุฑููŠุงู‹.\n\n"
574
+ else:
575
+ md += f"## โŒ Word Not Found\n\n**`{text}`** not found literally.\n\n"
576
+
577
  if sem_results:
578
  md += "---\n## ๐Ÿ” Semantic Results\n\n"
579
  for r in sem_results:
580
+ bar = "๐ŸŸฉ"*round(r["sem"]*10) + "โฌœ"*(10-round(r["sem"]*10))
581
  snippet = r["text"][:300].strip()
582
  for t in qterms:
583
  try: snippet = re.sub(rf"(?i)({re.escape(t)})", r"**\1**", snippet)
584
  except: pass
585
+ md += (
586
+ f"### Result {r['rank']} โ€” {bar} `{r['sem']*100:.1f}%` "
587
+ f"{'๐Ÿ‡ธ๐Ÿ‡ฆ' if r['lang']=='ar' else '๐Ÿ‡บ๐Ÿ‡ธ'}\n\n"
588
+ f"๐Ÿ“„ `{r['file']}` p.{r['page']}\n\n> {snippet}...\n\n"
589
+ )
590
+ else:
591
+ md += "---\n_No similar content found._\n"
592
  return sent, round(conf, 4), md
593
 
594
  # ============================================================
595
+ # ECONOMETRICS โ€” World Bank + ARIMA/SARIMAX
596
  # ============================================================
597
  def get_worldbank_data(country_code, indicator, start_year, end_year):
598
+ url = (
599
+ f"https://api.worldbank.org/v2/country/{country_code}/"
600
+ f"indicator/{indicator}?date={start_year}:{end_year}&per_page=100&format=json"
601
+ )
602
  try:
603
  resp = requests.get(url, timeout=15)
604
  resp.raise_for_status()
605
  data = resp.json()
606
  if not data or len(data) < 2 or not data[1]: return pd.DataFrame()
607
+ rows = [
608
+ {"year": int(e["date"]), "value": float(e["value"])}
609
+ for e in data[1]
610
+ if e.get("value") is not None and e.get("date") is not None
611
+ ]
612
  return pd.DataFrame(rows).dropna().sort_values("year").reset_index(drop=True)
613
  except Exception as e:
614
  print(f"World Bank error: {e}")
 
625
  sample = get_economic_chunks(texts, max_chunks=40)
626
  scores = [sentiment_score_numeric(t) for t in sample]
627
  avg = round(float(np.mean(scores)), 4)
628
+ year = next(
629
+ (m["year"] for m in KB_META if m["name"]==fname and m.get("year")), None
630
+ )
631
  file_results.append({
632
+ "file": fname, "year": year if year else "N/A",
633
+ "sentiment": avg, "n_chunks": len(sample),
634
+ "label": "๐ŸŸข Optimistic" if avg > 0.05 else "๐Ÿ”ด Pessimistic" if avg < -0.05 else "๐ŸŸก Neutral",
635
  })
636
+ if year:
637
+ yearly_sentiment.setdefault(year, []).append(avg)
638
 
639
+ yearly_avg = {
640
+ yr: round(float(np.mean(vals)), 4)
641
+ for yr, vals in yearly_sentiment.items()
642
+ }
643
  df_files = pd.DataFrame(file_results).sort_values("year")
644
+ df_yearly = (
645
+ pd.DataFrame([{"year": y, "sentiment": s} for y, s in sorted(yearly_avg.items())])
646
+ if yearly_avg else None
647
+ )
648
  return df_files, df_yearly
649
 
650
  def run_adf_check(series: np.ndarray, name: str):
 
652
  def adf_p(s):
653
  try: return adfuller(s, autolag='AIC')[1]
654
  except: return 1.0
 
655
  s = series.copy()
656
  p0 = adf_p(s)
657
+ if p0 <= 0.05:
658
+ return s, f"โœ… Stationary at level (p={p0:.4f})", False
659
  s1 = np.diff(s)
660
  p1 = adf_p(s1)
661
+ if p1 <= 0.05:
662
+ return s1, f"โš ๏ธ Non-stationary (p={p0:.4f}) โ†’ 1st diff โ†’ โœ… stationary (p={p1:.4f})", True
663
  s2 = np.diff(s1)
664
  p2 = adf_p(s2)
665
+ return (
666
+ s2,
667
+ f"โš ๏ธ Non-stationary (p={p0:.4f}) โ†’ 1st diff (p={p1:.4f}) โ†’ 2nd diff โ†’ "
668
+ f"{'โœ… stationary' if p2<=0.05 else 'โš ๏ธ non-stationary'} (p={p2:.4f})",
669
+ True,
670
+ )
671
 
672
  def run_granger_test(series_y, series_exog, maxlag=4):
673
  try:
674
  from statsmodels.tsa.stattools import grangercausalitytests
675
+ if len(series_y) < 10:
676
+ return "โš ๏ธ **Granger Test skipped** โ€” need โ‰ฅ 10 points.", False
677
  sy, status_y = run_adf_check(series_y.copy(), "Target")[:2]
678
  sexog, status_exog = run_adf_check(series_exog.copy(), "Sentiment")[:2]
 
679
  min_len = min(len(sy), len(sexog))
680
  sy, sexog = sy[-min_len:], sexog[-min_len:]
681
  maxlag = min(maxlag, max(1, (len(sy) - 1) // 3))
682
+ if len(sy) < 5:
683
+ return "โš ๏ธ **Granger Test skipped** โ€” too few obs after differencing.", False
684
+ gc_result = grangercausalitytests(
685
+ np.column_stack([sy, sexog]), maxlag=maxlag, verbose=False
686
+ )
687
  rows, any_pass, best_p = [], False, 1.0
688
  for lag, res in gc_result.items():
689
  p_val = res[0]["ssr_ftest"][1]
690
+ f_val = res[0]["ssr_ftest"][0]
691
+ if p_val < 0.05: sig = "โœ… Yes"; any_pass = True
692
+ elif p_val < 0.10: sig = "๐Ÿ”ถ Marginal"
693
+ else: sig = "โŒ No"
694
  best_p = min(best_p, p_val)
695
+ rows.append(f"| {lag} | {f_val:.4f} | {p_val:.4f} | {sig} |")
696
 
697
  table = (
698
  "### ๐Ÿ”ฌ Granger Causality Test\n"
699
  "*Hโ‚€: Sentiment does NOT Granger-cause Target*\n\n"
700
+ f"#### ๐Ÿ“‹ ADF Stationarity Pre-check\n\n"
701
+ f"| Series | ADF Result |\n|---|---|\n"
702
+ f"| ๐ŸŽฏ Target | {status_y} |\n"
703
+ f"| ๐Ÿ˜Š Sentiment | {status_exog} |\n\n"
704
+ "#### ๐Ÿ“Š Granger Results\n\n"
705
+ "| Lag | F-stat | p-value | Significant? |\n|-----|--------|---------|-------------|\n"
706
+ + "\n".join(rows)
707
  )
708
+ if any_pass:
709
+ verdict = f"\n\nโœ… **PASS** โ€” Sentiment significantly Granger-causes the target (p < 0.05)."
710
+ elif best_p < 0.10:
711
+ verdict = f"\n\n๐Ÿ”ถ **MARGINAL** โ€” best p = {best_p:.4f} (< 0.10)."
712
+ else:
713
+ verdict = "\n\nโŒ **FAIL** โ€” No significant Granger causality (p โ‰ฅ 0.05)."
714
  return table + verdict, any_pass
715
+ except Exception as e:
716
+ return f"โš ๏ธ Granger test error: `{e}`\n", False
717
 
718
  def run_dm_test(actual, pred_arima, pred_sarimax):
719
  try:
720
  n = len(actual)
721
+ if n < 3:
722
+ return "โš ๏ธ **DM Test skipped** โ€” n < 3.", False
723
+ d = (actual - pred_arima)**2 - (actual - pred_sarimax)**2
724
+ d_mean = np.mean(d)
725
+ d_std = np.std(d, ddof=1)
726
+ if d_std < 1e-10:
727
+ return "โš ๏ธ **DM Test** โ€” models identical.", False
728
  dm_stat = d_mean / (d_std / np.sqrt(n))
729
  p_val = 2 * (1 - stats.t.cdf(abs(dm_stat), df=n - 1))
730
  sig = "โœ… Yes" if p_val < 0.05 else ("๐Ÿ”ถ Marginal" if p_val < 0.10 else "โŒ No")
 
732
 
733
  table = (
734
  "### ๐ŸŽฏ Diebold-Mariano Test\n"
735
+ "*Hโ‚€: Equal predictive accuracy | Hโ‚: SARIMAX better than ARIMA*\n\n"
736
  "| DM Statistic | p-value | n (test) | Significant? | Better Model |\n"
737
  "|-------------|---------|----------|-------------|-------------|\n"
738
  f"| `{dm_stat:.4f}` | `{p_val:.4f}` | `{n}` | {sig} | **{better}** |\n"
739
  )
740
  passed = p_val < 0.05 and dm_stat > 0
741
+ if passed:
742
+ verdict = "\nโœ… **PASS** โ€” SARIMAX+Ensemble is **significantly better** (p < 0.05)."
743
+ elif (p_val < 0.10) and dm_stat > 0:
744
+ verdict = f"\n๐Ÿ”ถ **MARGINAL** โ€” p = {p_val:.4f} (< 0.10)."
745
+ else:
746
+ verdict = (
747
+ f"\nโŒ **FAIL** โ€” Not statistically significant (p = {p_val:.4f}).\n\n"
748
+ f"> ๐Ÿ’ก With n = {n} test points, power is limited. "
749
+ f"Expand Start Year to 1990 for more test data."
750
+ )
751
  return table + verdict, passed
752
+ except Exception as e:
753
+ return f"โš ๏ธ DM error: `{e}`\n", False
754
 
755
# ============================================================
# MAIN FORECAST FUNCTION — n = 3
# ============================================================
def run_economic_forecast(country_code, target_var, start_year, end_year):
    """Backtest ARIMA(1,1,1) vs SARIMAX(1,1,1)+sentiment on World Bank data.

    Fetches the chosen indicator for `country_code`, attaches a per-year
    document-sentiment exogenous series (or a constant global-mean fallback),
    holds out the last 3 years, fits both models, and runs Granger-causality
    and Diebold-Mariano tests.

    Args:
        country_code: ISO country code (e.g. "DZ").
        target_var:   Dropdown label; mapped to a World Bank indicator code.
        start_year, end_year: inclusive year range (coerced to int).

    Returns:
        (markdown_report, plot_path) — on any failure the markdown carries
        the error message and the plot path is None.
    """
    try:
        from statsmodels.tsa.arima.model import ARIMA
        from statsmodels.tsa.statespace.sarimax import SARIMAX
        from sklearn.metrics import mean_squared_error, mean_absolute_error
    except ImportError:
        return "❌ pip install statsmodels scikit-learn", None

    # World Bank indicator codes; unknown labels fall back to CPI inflation.
    # NOTE(review): keys keep the dropdown labels' trailing spaces on purpose;
    # GDP entry assumed to be NY.GDP.MKTP.KD.ZG (standard WB annual GDP
    # growth) — confirm against the data-fetch layer.
    indicator_map = {
        "Inflation (CPI %)" : "FP.CPI.TOTL.ZG",
        "GDP Growth (%) "   : "NY.GDP.MKTP.KD.ZG",
        "Unemployment (%) " : "SL.UEM.TOTL.ZS",
        "Exchange Rate"     : "PA.NUS.FCRF",
    }
    econ_df = get_worldbank_data(
        country_code,
        indicator_map.get(target_var, "FP.CPI.TOTL.ZG"),
        int(start_year), int(end_year),
    )
    if econ_df.empty:
        return f"❌ No data for **{country_code}** / **{target_var}**", None
    if len(econ_df) < 5:
        return f"⚠️ Only **{len(econ_df)}** data points. Widen year range.", None

    df_files, df_yearly = build_doc_sentiment_index()

    # Prefer a true per-year sentiment series; otherwise broadcast the global
    # mean file sentiment as a constant exogenous regressor.
    if df_yearly is not None and len(df_yearly) >= 2:
        merged = econ_df.merge(df_yearly, on="year", how="left")
        merged["sentiment"] = merged["sentiment"].fillna(
            float(df_yearly["sentiment"].mean())
        )
        has_yearly = True
        mode_msg = "✅ **Yearly Ensemble Sentiment**"
    else:
        global_sent = (
            float(pd.to_numeric(df_files["sentiment"], errors="coerce").mean())
            if df_files is not None and len(df_files) > 0 else 0.0
        )
        merged = econ_df.copy()
        merged["sentiment"] = global_sent
        has_yearly = False
        mode_msg = "⚠️ **Global Sentiment**"

    # Rescale only when sentiment actually varies; a constant column would
    # make MinMaxScaler degenerate.
    if merged["sentiment"].std() > 1e-6:
        scaler = MinMaxScaler(feature_range=(-0.3, 0.3))
        merged["sentiment"] = scaler.fit_transform(
            merged["sentiment"].values.reshape(-1, 1)
        ).flatten().round(4)

    series = merged["value"].values.astype(float)
    exog = merged["sentiment"].values.reshape(-1, 1)
    years = merged["year"].values
    n = len(series)

    # ============================================================
    # ✅ n = 3 — Test on last 3 years
    # ============================================================
    # For very short series keep >= 5 training points, but always leave at
    # least one test point (split <= n-1); the previous fallback could yield
    # split == n at n == 5, crashing the metrics on an empty test set.
    split = n - 3
    if split < 5:
        split = min(max(int(n * 0.75), 5), n - 1)

    train_y, test_y = series[:split], series[split:]
    train_exog, test_exog = exog[:split], exog[split:]
    test_years = years[split:]

    # --- ARIMA baseline (no exogenous regressor) ---
    try:
        m1 = ARIMA(train_y, order=(1, 1, 1)).fit()
        pred_arima = m1.forecast(len(test_y))
        rmse_a = float(np.sqrt(mean_squared_error(test_y, pred_arima)))
        mae_a = float(mean_absolute_error(test_y, pred_arima))
        mape_a = float(np.mean(np.abs((test_y-pred_arima)/np.maximum(np.abs(test_y),1e-8)))*100)
    except Exception as e:
        return f"❌ ARIMA error: {e}", None

    # --- SARIMAX with the ensemble sentiment as exogenous regressor ---
    try:
        m2 = SARIMAX(train_y, exog=train_exog, order=(1, 1, 1)).fit(disp=False)
        pred_sarimax = m2.forecast(len(test_y), exog=test_exog)
        rmse_s = float(np.sqrt(mean_squared_error(test_y, pred_sarimax)))
        mae_s = float(mean_absolute_error(test_y, pred_sarimax))
        mape_s = float(np.mean(np.abs((test_y-pred_sarimax)/np.maximum(np.abs(test_y),1e-8)))*100)
    except Exception as e:
        return f"❌ SARIMAX error: {e}", None

    # Positive percentage = SARIMAX beat ARIMA on that metric.
    impr_rmse = (rmse_a - rmse_s) / rmse_a * 100
    impr_mae = (mae_a - mae_s) / mae_a * 100
    impr_mape = (mape_a - mape_s) / mape_a * 100

    # Granger — use the full (unsplit) series; prefer years with real,
    # non-imputed sentiment when the yearly index is rich enough.
    if has_yearly and df_yearly is not None and len(df_yearly) >= 5:
        real_merged = econ_df.merge(df_yearly, on="year", how="inner")
        gc_y = real_merged["value"].values.astype(float)
        gc_exog = real_merged["sentiment"].values.astype(float)
    else:
        gc_y = series
        gc_exog = merged["sentiment"].values

    granger_md, granger_pass = run_granger_test(gc_y, gc_exog, maxlag=4)
    dm_md, dm_pass = run_dm_test(test_y, np.array(pred_arima), np.array(pred_sarimax))

    # ============================================================
    # PLOTS
    # ============================================================
    fig, axes = plt.subplots(4, 1, figsize=(11, 18))

    # Plot 1 — Forecast: actuals, both model forecasts, train/test divider
    axes[0].plot(years, series, "o-", color="#2196F3", label="Actual", lw=2, ms=5)
    axes[0].plot(test_years, pred_arima, "s--", color="#FF5722", label="ARIMA(1,1,1)", lw=2)
    axes[0].plot(test_years, pred_sarimax, "^-.", color="#4CAF50", label="SARIMAX+Ensemble", lw=2)
    axes[0].axvline(x=years[split-1], color="gray", linestyle=":", alpha=0.7, label="Train│Test")
    axes[0].set_title(
        f"📈 {target_var} — {country_code} (Yearly Ensemble Sentiment) | n_test={len(test_y)}",
        fontsize=11, fontweight="bold",
    )
    axes[0].set_xlabel("Year"); axes[0].set_ylabel(target_var)
    axes[0].legend(fontsize=9); axes[0].grid(True, alpha=0.3)

    # Plot 2 — Sentiment Index, bars colored by sign band
    s_clrs = [
        "#4CAF50" if s > 0.05 else "#FF5722" if s < -0.05 else "#FFC107"
        for s in merged["sentiment"]
    ]
    axes[1].bar(years, merged["sentiment"], color=s_clrs, edgecolor="white", width=0.6)
    axes[1].axhline(y=0, color="black", lw=0.8)
    legend_patches = [
        Patch(color="#4CAF50", label="Optimistic (>0.05)"),
        Patch(color="#FFC107", label="Neutral"),
        Patch(color="#FF5722", label="Pessimistic (<-0.05)"),
    ]
    axes[1].legend(handles=legend_patches, fontsize=8, loc="upper right")
    axes[1].set_title(
        "📊 Ensemble Sentiment Index (FinBERT 40% + XLM 30% + Lexicon 30%)\n"
        "per-year — normalized [-0.3, +0.3]",
        fontsize=10, fontweight="bold",
    )
    axes[1].set_xlabel("Year"); axes[1].set_ylabel("Sentiment Score")
    axes[1].grid(True, alpha=0.3, axis="y")

    # Plot 3 — RMSE bars: winner green, loser red.
    # (Removed dead `better_color_*` locals whose ternaries picked the same
    # color on both branches and were never used.)
    bar_colors = ["#FF5722" if rmse_a > rmse_s else "#4CAF50",
                  "#4CAF50" if rmse_s <= rmse_a else "#FF5722"]
    bars = axes[2].bar(
        ["ARIMA(1,1,1)", "SARIMAX\n+Ensemble"],
        [rmse_a, rmse_s], color=bar_colors, width=0.4, edgecolor="white",
    )
    for bar, val in zip(bars, [rmse_a, rmse_s]):
        axes[2].text(
            bar.get_x()+bar.get_width()/2, bar.get_height()+0.01,
            f"{val:.4f}", ha="center", va="bottom", fontweight="bold", fontsize=11,
        )
    axes[2].set_title("📉 RMSE Comparison (lower = better)", fontsize=11)
    axes[2].set_ylabel("RMSE"); axes[2].grid(True, alpha=0.3, axis="y")

    # Plot 4 — Statistical Tests Summary Table (pass/fail, color-coded rows)
    axes[3].axis("off")
    test_data = [
        ["Test", "Result", "Interpretation"],
        [
            "Granger (ADF + Granger)",
            "✅ PASS" if granger_pass else "❌ FAIL",
            "Sentiment Granger-causes Target" if granger_pass else "No causal link detected",
        ],
        [
            "Diebold-Mariano\n(SARIMAX vs ARIMA)",
            "✅ PASS" if dm_pass else "❌ FAIL",
            "SARIMAX significantly better" if dm_pass else f"n_test={len(test_y)} — limited power",
        ],
    ]
    tbl4 = axes[3].table(
        cellText=test_data[1:], colLabels=test_data[0],
        cellLoc="center", loc="center", colWidths=[0.35, 0.2, 0.45],
    )
    tbl4.auto_set_font_size(False); tbl4.set_fontsize(11); tbl4.scale(1, 2.5)
    for (row, col), cell in tbl4.get_celld().items():
        if row == 0:
            cell.set_facecolor("#1565C0")
            cell.set_text_props(color="white", fontweight="bold")
        elif row == 1:
            cell.set_facecolor("#E8F5E9" if granger_pass else "#FFEBEE")
        elif row == 2:
            cell.set_facecolor("#E8F5E9" if dm_pass else "#FFEBEE")
    axes[3].set_title(
        "🔬 Statistical Tests: ADF + Granger + DM",
        fontsize=12, fontweight="bold", pad=20,
    )

    plt.tight_layout(pad=3.0)
    img_path = "/tmp/forecast_plot.png"
    plt.savefig(img_path, dpi=130, bbox_inches="tight")
    plt.close(fig)

    # ============================================================
    # RESULT TEXT
    # ============================================================
    sent_table = ""
    if df_files is not None and len(df_files) > 0:
        sent_table = (
            "\n---\n### 📄 Ensemble Sentiment per File\n"
            "| 📄 File | 📅 Year | 😊 Score | 📦 Chunks | Label |\n|---|---|---|---|---|\n"
        )
        for _, row in df_files.iterrows():
            sent_table += (
                f"| `{row['file']}` | {row['year']} | "
                f"`{row['sentiment']:+.4f}` | {row['n_chunks']} | {row['label']} |\n"
            )

    result_md = (
        f"## 📊 Forecast — {country_code} / {target_var}\n\n"
        f"| | |\n|---|---|\n"
        f"| 🎯 Target Variable | **{target_var}** |\n"
        f"| 📈 Sentiment Mode | {mode_msg} |\n"
        f"| 📈 Train samples | **{split}** |\n"
        f"| 🧪 Test samples (n)| **{len(test_y)}** |\n\n"
        f"---\n### 🏆 Model Comparison\n"
        f"| Model | RMSE | MAE | MAPE |\n|---|---|---|---|\n"
        f"| ARIMA(1,1,1) | `{rmse_a:.4f}` | `{mae_a:.4f}` | `{mape_a:.1f}%` |\n"
        f"| SARIMAX+Ensemble | `{rmse_s:.4f}` | `{mae_s:.4f}` | `{mape_s:.1f}%` |\n"
        f"| **Improvement** | **{impr_rmse:+.1f}%** | **{impr_mae:+.1f}%** | **{impr_mape:+.1f}%** |\n\n"
        f"{'✅ **Improved** by adding Ensemble Sentiment Index.' if impr_rmse > 0 else '⚠️ No RMSE improvement for this variable.'}\n\n"
        f"---\n{granger_md}\n\n---\n{dm_md}\n{sent_table}"
    )
    return result_md, img_path
983
 
# ============================================================
# UTILITIES
# ============================================================
987
def generate_report(text, sent, conf, md):
    """Write a small markdown report to /tmp/report.md and return its path.

    `conf` is accepted for interface compatibility with the UI callback but
    is not included in the report body.
    """
    report_body = f"# Report\n\n**Input:** {text}\n**Sentiment:** {sent}\n\n{md}"
    path = "/tmp/report.md"
    with open(path, "w", encoding="utf-8") as fh:
        fh.write(report_body)
    return path
def export_chat(history):
    """Dump a messages-style chat history to /tmp/chat.txt and return the path."""
    path = "/tmp/chat.txt"
    rendered = [f"{turn['role']}:\n{turn['content']}\n\n" for turn in history]
    with open(path, "w", encoding="utf-8") as fh:
        fh.write("".join(rendered))
    return path
def get_stats():
    """Render the session counters (module globals) as a markdown table."""
    rows = [
        "### 📊 Session Stats\n",
        "| | |\n|---|---|",
        f"| ❓ Questions asked | **{CHAT_STATS['questions']}** |",
        f"| ✅ RAG answers | **{CHAT_STATS['found']}** |",
        f"| 🤖 General answers | **{CHAT_STATS['not_found']}** |",
        f"| 📦 Chunks indexed | **{len(KB_TEXTS):,}** |",
    ]
    return "\n".join(rows) + "\n"
def get_top_keywords():
    """List the 20 most frequent non-stopword terms (4+ chars) in the KB."""
    if not KB_TEXTS:
        return "_No files uploaded yet._"
    stopwords = {"this","that","with","from","have","been","were","they","their",
                 "there","what","when","which","will","also","than","into","more"}
    corpus = " ".join(KB_TEXTS).lower()
    counts = Counter(
        word for word in re.findall(r"\b\w{4,}\b", corpus)
        if word not in stopwords
    )
    bullet_lines = [f"- **{word}**: {freq}" for word, freq in counts.most_common(20)]
    return "### 🔑 Top 20 Keywords\n\n" + "\n".join(bullet_lines)
def update_threshold(val):
    """Set the global retrieval-similarity cutoff and return a confirmation."""
    global MIN_SIMILARITY
    MIN_SIMILARITY = val
    return "✅ Threshold set to: " + format(val, ".0%")
def chat_text(message, history):
    """Handle one text turn: blank input is a no-op; otherwise append Q/A.

    Returns ("", updated_history) so the textbox is cleared in the UI.
    """
    if not message.strip():
        return "", history
    answer, _ = smart_answer(message, history)
    new_turns = [
        {"role": "user", "content": message},
        {"role": "assistant", "content": answer},
    ]
    return "", history + new_turns
def tts_save(text, lang="en"):
    """Synthesize speech for up to 600 chars of markdown-stripped text.

    Any language other than "ar" is spoken in English. Returns the mp3 path.
    """
    clean = re.sub(r"[*`#>\[\]|_]", "", text)[:600]
    voice = "ar" if lang == "ar" else "en"
    path = "/tmp/ans.mp3"
    speech = gTTS(text=clean, lang=voice)
    speech.save(path)
    return path
def chat_voice(audio, history):
    """Voice turn: transcribe mic audio, answer via RAG, speak the answer.

    Returns (updated_history, mp3_path, transcript).
    """
    if audio is None:
        raise gr.Error("No audio received.")
    sample_rate, wave = audio
    if isinstance(wave, list):
        wave = np.array(wave)
    if wave.ndim > 1:
        # Downmix multi-channel recordings to mono.
        wave = wave.mean(axis=1)
    transcript = asr({"array": wave.astype(np.float32), "sampling_rate": sample_rate})["text"]
    lang = detect_lang(transcript)
    answer, _ = smart_answer(transcript, history)
    turns = [
        {"role": "user", "content": f"🎙️ {transcript}"},
        {"role": "assistant", "content": answer},
    ]
    return history + turns, tts_save(answer, lang), transcript
# ============================================================
# GRADIO UI
# ============================================================
with gr.Blocks(title="RAG + Sentiment + Forecast", theme=gr.themes.Soft()) as app:
    gr.Markdown(
        "# 🤖 Hybrid Multilingual RAG + Ensemble Sentiment + Economic Forecast\n"
        "**ENSSEA — Master's Thesis | Si Tayeb Houari | 2025–2026**"
    )

    # ── Tab 1: Upload ───────────────────────────────────────────
    # Build/persist the FAISS index and tune the retrieval threshold.
    with gr.Tab("📁 1 · Upload"):
        files = gr.File(
            label="📂 Upload Files (PDF / TXT / CSV / DOCX)",
            file_types=[".pdf",".txt",".csv",".docx"],
            file_count="multiple", type="filepath",
        )
        build_btn = gr.Button("🔨 Build Index", variant="primary")
        status = gr.Markdown("_No index built yet._")
        with gr.Row():
            save_btn = gr.Button("💾 Save Index")
            load_btn = gr.Button("🔄 Load Saved Index")
        persist_status = gr.Markdown()
        sim_slider = gr.Slider(0.0, 1.0, value=0.10, step=0.05, label="🎯 Similarity Threshold")
        threshold_status = gr.Markdown()
        build_btn.click(build_index, inputs=files, outputs=status)
        save_btn.click(save_index, outputs=persist_status)
        load_btn.click(load_saved_index, outputs=persist_status)
        sim_slider.change(update_threshold, inputs=sim_slider, outputs=threshold_status)

    # ── Tab 2: Sentiment & Word Search ────────────────────────
    with gr.Tab("🎭 2 · Sentiment & Search"):
        inp = gr.Textbox(lines=3, label="📝 Enter text or keyword")
        run_btn = gr.Button("🔍 Analyze & Search", variant="primary")
        with gr.Row():
            out_sent = gr.Textbox(label="🎭 Sentiment")
            out_conf = gr.Number(label="📊 Score")
        out_full = gr.Markdown()
        rep_btn = gr.Button("📄 Download Report")
        rep_file = gr.File(label="📥 Report")
        run_btn.click(predict_with_rag, inputs=inp, outputs=[out_sent, out_conf, out_full])
        rep_btn.click(generate_report, inputs=[inp, out_sent, out_conf, out_full], outputs=rep_file)

    # ── Tab 3: Smart Chatbot ────────────────────────────────────
    with gr.Tab("💬 3 · Smart Chatbot"):
        chatbot = gr.Chatbot(height=430, type="messages", show_label=False)
        msg = gr.Textbox(placeholder="Ask anything about your documents…", label="💬 Message")
        with gr.Row():
            send_btn = gr.Button("📨 Send", variant="primary")
            clear_btn = gr.Button("🗑️ Clear")
            exp_btn = gr.Button("📥 Export")
        exp_file = gr.File(label="💾 Chat Export")
        # Enter key and Send button share the same handler.
        msg.submit(chat_text, inputs=[msg, chatbot], outputs=[msg, chatbot])
        send_btn.click(chat_text, inputs=[msg, chatbot], outputs=[msg, chatbot])
        clear_btn.click(lambda: ([], ""), outputs=[chatbot, msg])
        exp_btn.click(export_chat, inputs=chatbot, outputs=exp_file)

    # ── Tab 4: Voice ────────────────────────────────────────────
    with gr.Tab("🎙️ 4 · Voice"):
        gr.Markdown("### 🎙️ Speak your question — get a spoken answer")
        voice_input = gr.Audio(sources=["microphone"], type="numpy", label="🎤 Record")
        voice_btn = gr.Button("🎙️ Ask by Voice", variant="primary")
        voice_chat = gr.Chatbot(height=300, type="messages")
        audio_output = gr.Audio(label="🔊 Answer", autoplay=True)
        transcript_out = gr.Textbox(label="📝 Transcript")
        voice_btn.click(chat_voice, inputs=[voice_input, voice_chat],
                        outputs=[voice_chat, audio_output, transcript_out])

    # ── Tab 5: Analytics ─────────────────────────────────────────
    with gr.Tab("📊 5 · Analytics"):
        stats_btn = gr.Button("📊 Refresh Stats")
        stats_out = gr.Markdown()
        kw_btn = gr.Button("🔑 Top Keywords")
        kw_out = gr.Markdown()
        stats_btn.click(get_stats, outputs=stats_out)
        kw_btn.click(get_top_keywords, outputs=kw_out)

    # ── Tab 6: About ─────────────────────────────────────────────
    with gr.Tab("ℹ️ 6 · About"):
        gr.Markdown(
            "## 🤖 Hybrid Multilingual RAG Framework\n\n"
            "| Component | Details |\n|---|---|\n"
            "| 🏫 School | ENSSEA — École Nationale Supérieure de Statistique et d'Économie Appliquée |\n"
            "| 👤 Author | Si Tayeb Houari |\n"
            "| 📅 Year | 2025–2026 |\n"
            "| 🎓 Degree | Master's — Statistics & Foresight Economics |\n\n"
            "### 🔧 Models Used\n"
            "- 🏦 **FinBERT** (ProsusAI) — Financial sentiment (40%)\n"
            "- 🌍 **XLM-RoBERTa** (CardiffNLP) — Multilingual sentiment (30%)\n"
            "- 📖 **Economic Lexicon** — Domain-specific keywords (30%)\n"
            "- 🔍 **MiniLM-L12** — Multilingual embeddings (FAISS)\n"
            "- 📊 **ms-marco-MiniLM** — Cross-encoder reranking\n"
            "- 🗣️ **Whisper-small** — ASR\n"
            "- 🤖 **Llama-3.3-70B** via Groq — Response generation\n\n"
            "### 📊 Forecasting\n"
            "- Baseline: **ARIMA(1,1,1)**\n"
            "- Enhanced: **SARIMAX + Ensemble Sentiment** (n_test = 3)\n"
            "- Tests: **ADF**, **Granger Causality**, **Diebold-Mariano**\n"
            "- Data: **World Bank API**\n"
        )

    # ── Tab 7: Economic Forecast ────────────────────────────────
    with gr.Tab("📈 7 · Forecast"):
        gr.Markdown(
            "## 📈 Economic Forecast — ARIMA vs SARIMAX + Ensemble Sentiment\n"
            "> **n_test = 3** — Evaluates on the last 3 years (captures recent economic turbulence)"
        )
        with gr.Row():
            country_input = gr.Textbox(
                value="DZ", label="🌍 Country Code (ISO)",
                placeholder="e.g. DZ, MA, TN, EG, US",
            )
            # NOTE(review): these labels (including trailing spaces) must
            # match the keys of indicator_map in run_economic_forecast.
            target_input = gr.Dropdown(
                choices=[
                    "Inflation (CPI %)",
                    "GDP Growth (%) ",
                    "Unemployment (%) ",
                    "Exchange Rate",
                ],
                value="Inflation (CPI %)",
                label="🎯 Target Variable",
            )
        with gr.Row():
            start_year = gr.Slider(
                minimum=1990, maximum=2020, value=2000, step=1, label="📅 Start Year"
            )
            end_year = gr.Slider(
                minimum=2010, maximum=2024, value=2023, step=1, label="📅 End Year"
            )
        forecast_btn = gr.Button("📈 Run Forecast", variant="primary", size="lg")
        forecast_result = gr.Markdown()
        forecast_plot = gr.Image(label="📊 Forecast Chart", type="filepath")
        forecast_btn.click(
            run_economic_forecast,
            inputs=[country_input, target_input, start_year, end_year],
            outputs=[forecast_result, forecast_plot],
        )

# Bind to all interfaces for the hosted (Spaces-style) deployment.
app.launch(server_name="0.0.0.0", server_port=7860, show_api=False)