Spaces:
Running
Update app.py
Browse files# ============================================================
# IMPORTS
# ============================================================
import re
import os
import math
import pickle
import requests
from collections import Counter
import numpy as np
import pandas as pd
import faiss
import PyPDF2
import torch
import gradio as gr
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
from sentence_transformers import SentenceTransformer, CrossEncoder
from langdetect import detect, DetectorFactory
from gtts import gTTS
from transformers import pipeline as hf_pipeline
from transformers import pipeline
from datetime import datetime
from groq import Groq
from sklearn.preprocessing import MinMaxScaler
from scipy import stats
DetectorFactory.seed = 0
# --- API client ---
# Groq key is read from the environment; empty string if unset.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
groq_client = Groq(api_key=GROQ_API_KEY)
# --- In-memory knowledge base state (mutated by build/load functions) ---
KB_TEXTS = []            # list[str]: text chunks indexed for retrieval
KB_META = []             # list[dict]: per-chunk metadata, parallel to KB_TEXTS
FAISS_INDEX = None       # faiss index over chunk embeddings, or None before build
KB_EMB = None            # chunk embedding matrix, or None before build
DOC_TYPE_INFO = {"type": "๐ General", "is_economic": False, "score": 0}  # whole-KB doc-type summary
PER_FILE_INFO = {}       # per-file doc-type info keyed by filename
CHAT_STATS = {"questions": 0, "found": 0, "not_found": 0}  # running chat counters
MIN_SIMILARITY = 0.10    # minimum semantic score for a retrieval candidate
# --- Persistence paths (under /tmp, recreated on startup) ---
PERSIST_DIR = "/tmp"
KB_TEXTS_PATH = f"{PERSIST_DIR}/kb_texts.pkl"
KB_META_PATH = f"{PERSIST_DIR}/kb_meta.pkl"
FAISS_PATH = f"{PERSIST_DIR}/faiss.index"
os.makedirs(PERSIST_DIR, exist_ok=True)
# ============================================================
# PERSIST
# ============================================================
def save_index():
    """Persist the in-memory knowledge base (chunks, metadata, FAISS index).

    Writes KB_TEXTS and KB_META as pickles and the FAISS index to disk
    under PERSIST_DIR. Returns a human-readable status string; never raises.
    """
    if FAISS_INDEX is None or not KB_TEXTS:
        return "โ ๏ธ No index to save."
    try:
        # Pickle the two parallel lists, then the FAISS index itself.
        for payload, dest in ((KB_TEXTS, KB_TEXTS_PATH), (KB_META, KB_META_PATH)):
            with open(dest, "wb") as fh:
                pickle.dump(payload, fh)
        faiss.write_index(FAISS_INDEX, FAISS_PATH)
        return f"๐พ Saved! {len(KB_TEXTS):,} chunks"
    except Exception as exc:
        return f"โ Save error: {exc}"
def load_saved_index():
    """Restore a previously saved knowledge base from PERSIST_DIR.

    Repopulates the module-level KB_TEXTS, KB_META, FAISS_INDEX and
    DOC_TYPE_INFO globals from the pickles / FAISS file written by
    save_index(). Returns a Markdown status string; never raises.
    """
    global KB_TEXTS, KB_META, FAISS_INDEX, DOC_TYPE_INFO
    try:
        # The FAISS file is the marker for a complete saved snapshot.
        if not os.path.exists(FAISS_PATH):
            return "_No saved index found._"
        with open(KB_TEXTS_PATH, "rb") as f:
            KB_TEXTS = pickle.load(f)
        with open(KB_META_PATH, "rb") as f:
            KB_META = pickle.load(f)
        FAISS_INDEX = faiss.read_index(FAISS_PATH)
        # Re-derive the document-type summary from the restored chunks.
        DOC_TYPE_INFO = detect_document_type(KB_TEXTS)
        # FIX: the original return literal was split across two source lines
        # (broken string constant); rejoined into a single valid f-string.
        return f"โ **Index loaded!** `{len(KB_TEXTS):,}` chunks\n๐ท๏ธ Type: **{DOC_TYPE_INFO['type']}**"
    except Exception as e:
        return f"โ Load error: {e}"
# ============================================================
# KEYWORDS & LEXICONS
# ============================================================
ECONOMIC_KEYWORDS = [
"gdp","inflation","monetary","fiscal","forecast","exchange rate",
"interest rate","unemployment","recession","growth rate","trade balance",
"budget deficit","central bank","economic outlook","imf","world bank",
"cpi","macro","revenue","expenditure","deficit","surplus","debt",
"croissance","taux","banque centrale","prรฉvision","รฉconomique","pib",
"ุงูุชุถุฎู
","ุงููุงุชุฌ ุงูู
ุญูู","ุงููู
ู ุงูุงูุชุตุงุฏู","ุงูุจูู ุงูู
ุฑูุฒู","ุณุนุฑ ุงูุตุฑู",
]
# Keyword lexicons used by detect_document_type() to score non-economic domains
# (substring matches against the lowercased text of the first chunks).
MEDICAL_KEYWORDS = ["patient","diagnosis","treatment","clinical","hospital","symptom","disease"]
LEGAL_KEYWORDS = ["article","law","contract","clause","jurisdiction","court","legal"]
ACADEMIC_KEYWORDS = ["abstract","methodology","hypothesis","conclusion","references","doi","journal"]
ECON_POSITIVE = [
"growth","recovery","surplus","improvement","stability","increase",
"expansion","acceleration","resilience","upturn","robust","favorable",
"strengthened","progress","rebound","optimistic","confidence","boom",
"prosper","thrive","advance","gain","rise","positive","upward",
"exceed","outperform","strong","healthy","dynamic","sustainable",
"croissance","reprise","amรฉlioration","stabilitรฉ","excรฉdent","hausse",
"ุชุนุงูู","ูู
ู","ุงุณุชูุฑุงุฑ","ูุงุฆุถ","ุชุญุณูู","ุงุฑุชูุงุน","ุชูุณุน","ุฅูุฌุงุจู",
]
ECON_NEGATIVE = [
"deficit","recession","inflation","decline","contraction","debt",
"crisis","deterioration","slowdown","downturn","unemployment","pressure",
"risk","vulnerability","shock","uncertainty","war","sanctions",
"drought","collapse","default","volatile","instability","weak",
"fragile","pessimistic","loss","shrink","fall","negative","downward",
"dรฉficit","rรฉcession","crise","ralentissement","chรดmage","incertitude",
"ุนุฌุฒ","ุชุถุฎู
","ุฑููุฏ","ุงููู
ุงุด","ุฃุฒู
ุฉ","ุชุฏููุฑ","ุจุทุงูุฉ","ุงูุฎูุงุถ",
"ุถุบุท","ู
ุฎุงุทุฑ","ุตุฏู
ุฉ","ุนุฏู
ุงุณุชูุฑุงุฑ","ูุดุงุดุฉ","ุฏููู",
]
ECON_TRIGGER = [
"deficit","risk","crisis","recession","shock","uncertainty",
"slowdown","pressure","vulnerable","weak","deteriorat","downturn",
"growth","recovery","improvement","surplus","stable","expansion",
"resilience","rebound","gdp","forecast","outlook","trade","fiscal",
"monetary","exchange","interest","budget","revenue","expenditure",
"ุงูุชุถุฎู
","ุงููุงุชุฌ","ุงููู
ู","ุงูุนุฌุฒ","ุงูู
ุฎุงุทุฑ","ุงูุชููุนุงุช",
"croissance","dรฉficit","rรฉcession","prรฉvision","taux","politique",
]
def economic_lexicon_score(text: str) -> float:
    """Lexicon-based polarity in [-1, 1].

    Counts ECON_POSITIVE / ECON_NEGATIVE substring hits in the lowercased
    text and returns (positives - negatives) / total hits, rounded to 4
    decimals. With no hits at all the denominator floors at 1, yielding 0.0.
    """
    lowered = text.lower()
    hits_pos = sum(term in lowered for term in ECON_POSITIVE)
    hits_neg = sum(term in lowered for term in ECON_NEGATIVE)
    denom = max(hits_pos + hits_neg, 1)
    return round((hits_pos - hits_neg) / denom, 4)
def detect_document_type(texts: list) -> dict:
    """Classify a chunk list as economic / medical / legal / academic / general.

    Scores each category by keyword hits over the first 30 chunks; "general"
    gets a fixed floor score of 1 so it wins when nothing else matches.
    Returns a dict with the display type, raw category, an is_economic flag
    (economic winner with at least 3 hits), the winning score and a
    confidence ratio.
    """
    if not texts:
        return {"type":"๐ General","is_economic":False,"score":0,"confidence":0.0}
    sample = " ".join(texts[:30]).lower()
    # Insertion order matters: max() keeps the first category on ties.
    keyword_sets = {
        "economic": ECONOMIC_KEYWORDS,
        "medical" : MEDICAL_KEYWORDS,
        "legal"   : LEGAL_KEYWORDS,
        "academic": ACADEMIC_KEYWORDS,
    }
    scores = {cat: sum(1 for kw in kws if kw in sample) for cat, kws in keyword_sets.items()}
    scores["general"] = 1
    best = max(scores, key=scores.get)
    confidence = round(scores[best] / max(sum(scores.values()), 1), 2)
    icons = {
        "economic":"๐ Economic","medical":"๐ฅ Medical",
        "legal":"โ๏ธ Legal","academic":"๐ Academic","general":"๐ General",
    }
    return {
        "type"       : icons.get(best, "๐ General"),
        "raw_type"   : best,
        "is_economic": best == "economic" and scores["economic"] >= 3,
        "score"      : scores[best],
        "confidence" : confidence,
    }
# ============================================================
# AI MODELS
# ============================================================
# Ensemble weights for the three sentiment signals; they sum to 1.0 and are
# applied in sentiment_score_numeric().
WEIGHTS = {"finbert": 0.40, "xlm": 0.30, "lexicon": 0.30}
print("โณ Loading FinBERT...")
# Best-effort model load: on any failure the pipeline is set to None and the
# OK flag cleared, so the corresponding clf_* function returns 0.0.
try:
    finbert_pipe = pipeline(
        "text-classification", model="ProsusAI/finbert",
        tokenizer="ProsusAI/finbert", return_all_scores=True,
        device=0 if torch.cuda.is_available() else -1,  # GPU 0 if present, else CPU
    )
    FINBERT_OK = True
except Exception as e:
    print(f"โ ๏ธ FinBERT: {e}"); finbert_pipe = None; FINBERT_OK = False
print("โณ Loading XLM-RoBERTa...")
# Same best-effort pattern for the multilingual sentiment model.
try:
    xlm_pipe = pipeline(
        "text-classification",
        model="cardiffnlp/twitter-xlm-roberta-base-sentiment",
        tokenizer="cardiffnlp/twitter-xlm-roberta-base-sentiment",
        return_all_scores=True,
        device=0 if torch.cuda.is_available() else -1,  # GPU 0 if present, else CPU
    )
    XLM_OK = True
except Exception as e:
    print(f"โ ๏ธ XLM: {e}"); xlm_pipe = None; XLM_OK = False
def normalize_clf(raw):
    """Normalize a text-classification pipeline output to a flat list.

    Pipelines called with return_all_scores=True produce a nested
    [[{...}, ...]] for a single input — unwrap the outer list. A bare
    non-list result is wrapped into a one-element list.
    """
    if isinstance(raw, list):
        if raw and isinstance(raw[0], list):
            return raw[0]
        return raw
    return [raw]
def clf_finbert(text: str) -> float:
    """FinBERT polarity in [-1, 1]: P(positive) - P(negative), 4 decimals.

    Input is truncated to 512 characters before inference. Returns 0.0 when
    the model failed to load or inference raises.
    """
    if not FINBERT_OK or finbert_pipe is None:
        return 0.0
    try:
        items = normalize_clf(finbert_pipe(text[:512]))
        scores = {r["label"].lower(): float(r["score"]) for r in items}
        return round(scores.get("positive", 0.0) - scores.get("negative", 0.0), 4)
    # FIX: narrowed from a bare `except:` which also swallowed
    # KeyboardInterrupt/SystemExit; fallback behavior (0.0) unchanged.
    except Exception:
        return 0.0
def clf_xlm(text: str) -> float:
    """XLM-RoBERTa polarity in [-1, 1]: P(positive) - P(negative), 4 decimals.

    Accepts either raw label ids (LABEL_2 treated as positive, LABEL_0 as
    negative) or named labels. Input is truncated to 512 characters.
    Returns 0.0 when the model failed to load or inference raises.
    """
    if not XLM_OK or xlm_pipe is None:
        return 0.0
    try:
        items = normalize_clf(xlm_pipe(text[:512]))
        d = {r["label"]: float(r["score"]) for r in items}
        pos = d.get("LABEL_2", d.get("positive", d.get("Positive", 0.0)))
        neg = d.get("LABEL_0", d.get("negative", d.get("Negative", 0.0)))
        return round(pos - neg, 4)
    # FIX: narrowed from a bare `except:` which also swallowed
    # KeyboardInterrupt/SystemExit; fallback behavior (0.0) unchanged.
    except Exception:
        return 0.0
def sentiment_score_numeric(text: str) -> float:
    """Weighted ensemble sentiment in [-1, 1], rounded to 4 decimals.

    Combines FinBERT, XLM-RoBERTa and the lexicon score using the
    module-level WEIGHTS (40% / 30% / 30%).
    """
    signals = {
        "finbert": clf_finbert(text),
        "xlm":     clf_xlm(text),
        "lexicon": economic_lexicon_score(text),
    }
    return round(sum(WEIGHTS[name] * value for name, value in signals.items()), 4)
def run_sentiment(text: str):
    """Return (label, confidence) for the ensemble sentiment of *text*.

    Scores within the +/-0.05 dead-band are labelled neutral; confidence is
    |score| capped at 1.0, rounded to 4 decimals.
    """
    score = sentiment_score_numeric(text)
    if score > 0.05:
        label = "Positive ๐"
    elif score < -0.05:
        label = "Negative ๐"
    else:
        label = "Neutral ๐"
    return label, round(min(abs(score), 1.0), 4)
def run_sentiment_detailed(text: str) -> str:
    """Return a Markdown table breaking the ensemble sentiment down per model.

    Shows FinBERT, XLM-RoBERTa and lexicon sub-scores with a 10-cell bar
    each, plus the weighted final score and its polarity label.
    NOTE: sentiment_score_numeric() re-runs all three scorers internally,
    so each model is invoked twice per call.
    """
    fb = clf_finbert(text)
    xlm = clf_xlm(text)
    lex = economic_lexicon_score(text)
    final = sentiment_score_numeric(text)
    def bar(s):
        # Map a score in [-1, 1] to 0..10 filled cells; cell color follows
        # the same +/-0.05 dead-band as the labels.
        filled = max(0, min(10, round((s + 1) / 2 * 10)))
        icon = "๐ฉ" if s > 0.05 else "๐ฅ" if s < -0.05 else "๐จ"
        return icon * filled + "โฌ" * (10 - filled)
    label = "๐ข **Positive**" if final > 0.05 else "๐ด **Negative**" if final < -0.05 else "๐ก **Neutral**"
    return (
        f"### ๐ Ensemble Sentiment Breakdown\n\n"
        f"| Model | Score | Bar | Weight |\n|---|---|---|---|\n"
        f"| ๐ฆ FinBERT | `{fb:+.4f}` | {bar(fb)} | **40%** |\n"
        f"| ๐ XLM-RoBERTa | `{xlm:+.4f}` | {bar(xlm)} | **30%** |\n"
        f"| ๐ Lexicon | `{lex:+.4f}` | {bar(lex)} | **30%** |\n"
        f"| โก **Final** | **`{final:+.4f}`** | {bar(final)} | **100%** |\n\n"
        f"{label}"
    )
print("โณ Loading Embedder + Reranker + ASR...")
embedder = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2", max_length=512)
asr = hf_pipeline(
"automatic-speech-recognition", model="openai/whisper-small",
|
@@ -34,7 +34,7 @@ DetectorFactory.seed = 0
|
|
| 34 |
# ============================================================
|
| 35 |
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
|
| 36 |
groq_client = Groq(api_key=GROQ_API_KEY)
|
| 37 |
-
print(f"DEBUG โ Groq Key: {bool(GROQ_API_KEY)}")
|
| 38 |
|
| 39 |
# ============================================================
|
| 40 |
# GLOBAL STATE
|
|
@@ -54,6 +54,9 @@ KB_META_PATH = f"{PERSIST_DIR}/kb_meta.pkl"
|
|
| 54 |
FAISS_PATH = f"{PERSIST_DIR}/faiss.index"
|
| 55 |
os.makedirs(PERSIST_DIR, exist_ok=True)
|
| 56 |
|
|
|
|
|
|
|
|
|
|
| 57 |
def save_index():
|
| 58 |
if FAISS_INDEX is None or not KB_TEXTS:
|
| 59 |
return "โ ๏ธ No index to save."
|
|
@@ -150,7 +153,7 @@ def detect_document_type(texts: list) -> dict:
|
|
| 150 |
}
|
| 151 |
doc_type = max(scores, key=scores.get)
|
| 152 |
confidence = round(scores[doc_type] / max(sum(scores.values()), 1), 2)
|
| 153 |
-
icons
|
| 154 |
"economic":"๐ Economic","medical":"๐ฅ Medical",
|
| 155 |
"legal":"โ๏ธ Legal","academic":"๐ Academic","general":"๐ General",
|
| 156 |
}
|
|
@@ -163,11 +166,11 @@ def detect_document_type(texts: list) -> dict:
|
|
| 163 |
}
|
| 164 |
|
| 165 |
# ============================================================
|
| 166 |
-
# AI MODELS
|
| 167 |
# ============================================================
|
| 168 |
WEIGHTS = {"finbert": 0.40, "xlm": 0.30, "lexicon": 0.30}
|
| 169 |
|
| 170 |
-
print("โณ Loading FinBERT
|
| 171 |
try:
|
| 172 |
finbert_pipe = pipeline(
|
| 173 |
"text-classification",
|
|
@@ -230,9 +233,9 @@ def sentiment_score_numeric(text: str) -> float:
|
|
| 230 |
|
| 231 |
def run_sentiment(text: str):
|
| 232 |
score = sentiment_score_numeric(text)
|
| 233 |
-
if score > 0.05:
|
| 234 |
elif score < -0.05: sent = "Negative ๐"
|
| 235 |
-
else:
|
| 236 |
return sent, round(min(abs(score), 1.0), 4)
|
| 237 |
|
| 238 |
def run_sentiment_detailed(text: str) -> str:
|
|
@@ -248,37 +251,50 @@ def run_sentiment_detailed(text: str) -> str:
|
|
| 248 |
return (
|
| 249 |
f"### ๐ Ensemble Sentiment Breakdown\n\n"
|
| 250 |
f"| Model | Score | Bar | Weight |\n|---|---|---|---|\n"
|
| 251 |
-
f"| ๐ฆ FinBERT
|
| 252 |
-
f"| ๐ XLM-RoBERTa
|
| 253 |
-
f"| ๐ Lexicon
|
| 254 |
-
f"| โก **Final**
|
| 255 |
f"{label}"
|
| 256 |
)
|
| 257 |
|
| 258 |
-
|
|
|
|
|
|
|
|
|
|
| 259 |
embedder = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
|
| 260 |
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2", max_length=512)
|
| 261 |
-
asr
|
|
|
|
|
|
|
|
|
|
|
|
|
| 262 |
_ = embedder.encode(["warmup"], convert_to_numpy=True)
|
| 263 |
print("โ
All models loaded!")
|
| 264 |
|
| 265 |
_startup = load_saved_index()
|
| 266 |
-
print(f"๐ Startup: {_startup}")
|
| 267 |
|
| 268 |
# ============================================================
|
| 269 |
# RAG CORE
|
| 270 |
# ============================================================
|
| 271 |
-
def clean_filename(path: str) -> str:
|
|
|
|
|
|
|
| 272 |
def detect_lang(text: str) -> str:
|
| 273 |
-
try:
|
| 274 |
-
|
|
|
|
|
|
|
| 275 |
|
| 276 |
def extract_year_from_filename(filename: str):
|
| 277 |
full_path = str(filename).replace("\\", "/")
|
| 278 |
for part in reversed(full_path.split("/")):
|
| 279 |
m = re.findall(r"\b(20\d{2}|19\d{2})\b", part)
|
| 280 |
if m: return int(m[0])
|
| 281 |
-
for pat in [r'WEO[_\-\s]?(\d{4})', r'BOA[_\-\s]?(\d{4})',
|
|
|
|
|
|
|
| 282 |
m = re.search(pat, full_path, re.IGNORECASE)
|
| 283 |
if m: return int(m.group(1))
|
| 284 |
all_y = re.findall(r'\b(19\d{2}|20\d{2})\b', full_path)
|
|
@@ -324,12 +340,15 @@ def load_file(path):
|
|
| 324 |
from docx import Document
|
| 325 |
doc = Document(path)
|
| 326 |
pars = [p.text for p in doc.paragraphs if p.text.strip()]
|
| 327 |
-
return [{"text": "\n".join(pars[i:i+50]), "page": i//50+1}
|
| 328 |
-
|
|
|
|
|
|
|
| 329 |
if path.endswith(".csv"):
|
| 330 |
df = pd.read_csv(path)
|
| 331 |
col = "text" if "text" in df.columns else df.columns[0]
|
| 332 |
-
return [{"text": t, "page": i+1}
|
|
|
|
| 333 |
with open(path, "r", encoding="utf-8", errors="ignore") as f:
|
| 334 |
return [{"text": f.read(), "page": 1}]
|
| 335 |
|
|
@@ -340,27 +359,32 @@ def build_index(files):
|
|
| 340 |
file_paths = []
|
| 341 |
if not isinstance(files, list): files = [files]
|
| 342 |
for f in files:
|
| 343 |
-
if isinstance(f, str):
|
| 344 |
-
elif isinstance(f, dict):
|
| 345 |
-
elif hasattr(f, "name"):
|
| 346 |
-
else:
|
| 347 |
|
| 348 |
for p in file_paths:
|
| 349 |
full_path = str(p)
|
| 350 |
fname = clean_filename(full_path)
|
| 351 |
year = extract_year_from_filename(fname) or extract_year_from_filename(full_path)
|
| 352 |
-
pages
|
|
|
|
| 353 |
for pg in pages:
|
| 354 |
for ch in chunk_text(pg["text"]):
|
| 355 |
KB_TEXTS.append(ch)
|
| 356 |
-
KB_META.append({"name": fname, "lang": detect_lang(ch),
|
|
|
|
| 357 |
file_texts.append(ch)
|
| 358 |
ti = detect_document_type(file_texts)
|
| 359 |
ti["year"] = year
|
| 360 |
PER_FILE_INFO[fname] = ti
|
| 361 |
|
| 362 |
if not KB_TEXTS: raise gr.Error("โ ๏ธ No text extracted.")
|
| 363 |
-
KB_EMB = embedder.encode(
|
|
|
|
|
|
|
|
|
|
| 364 |
FAISS_INDEX = faiss.IndexFlatIP(KB_EMB.shape[1])
|
| 365 |
FAISS_INDEX.add(KB_EMB)
|
| 366 |
DOC_TYPE_INFO = detect_document_type(KB_TEXTS)
|
|
@@ -375,20 +399,33 @@ def build_index(files):
|
|
| 375 |
tbl += f"| `{fname}` | {yrb} | {info['type']}{badge} | {info['confidence']:.0%} | {n} |\n"
|
| 376 |
|
| 377 |
ef = [f for f,i in PER_FILE_INFO.items() if i["is_economic"]]
|
| 378 |
-
fmsg = (
|
|
|
|
|
|
|
|
|
|
|
|
|
| 379 |
save_index()
|
| 380 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 381 |
|
| 382 |
def bm25_score(query_terms, doc, k1=1.5, b=0.75, avg_dl=200):
|
| 383 |
try:
|
| 384 |
if not KB_TEXTS or not isinstance(doc, str): return 0.0
|
| 385 |
-
dl, score
|
|
|
|
| 386 |
for term in query_terms:
|
| 387 |
if not isinstance(term, str) or not term: continue
|
| 388 |
-
tl
|
| 389 |
n_doc = sum(1 for t in KB_TEXTS if isinstance(t,str) and tl in t.lower())
|
| 390 |
-
tf
|
| 391 |
-
idf
|
| 392 |
score += idf*(tf*(k1+1))/(tf+k1*(1-b+b*dl/max(avg_dl,1)))
|
| 393 |
return score
|
| 394 |
except: return 0.0
|
|
@@ -396,24 +433,28 @@ def bm25_score(query_terms, doc, k1=1.5, b=0.75, avg_dl=200):
|
|
| 396 |
def rag_retrieve(query, k=5, top_n=3):
|
| 397 |
if FAISS_INDEX is None or not KB_TEXTS: return []
|
| 398 |
try:
|
| 399 |
-
q_emb = embedder.encode(
|
|
|
|
|
|
|
| 400 |
scores, idx = FAISS_INDEX.search(q_emb, min(k*3, len(KB_TEXTS)))
|
| 401 |
candidates, qterms = [], [t for t in re.findall(r"\w+", str(query).lower()) if t]
|
| 402 |
for rank, i in enumerate(idx[0]):
|
| 403 |
if i == -1: continue
|
| 404 |
-
sem
|
| 405 |
if sem < MIN_SIMILARITY: continue
|
| 406 |
text = KB_TEXTS[i]
|
| 407 |
if not isinstance(text, str): continue
|
| 408 |
kw = bm25_score(qterms, text)
|
| 409 |
lterms = [t for t in qterms if len(t) > 2]
|
| 410 |
-
try:
|
|
|
|
| 411 |
except: exact = False
|
| 412 |
hybrid = sem*0.6 + min(kw/10, 0.4) + (0.15 if exact else 0.0)
|
| 413 |
candidates.append({
|
| 414 |
"idx": i, "sem": sem, "kw": kw, "exact": exact, "hybrid": hybrid,
|
| 415 |
"lang": KB_META[i]["lang"], "file": KB_META[i]["name"],
|
| 416 |
-
"page": KB_META[i]["page"], "year": KB_META[i].get("year"),
|
|
|
|
| 417 |
})
|
| 418 |
if not candidates: return []
|
| 419 |
ce_scores = reranker.predict([[query, c["text"]] for c in candidates])
|
|
@@ -428,7 +469,7 @@ def rag_retrieve(query, k=5, top_n=3):
|
|
| 428 |
return []
|
| 429 |
|
| 430 |
def get_economic_chunks(texts: list, max_chunks: int = 40) -> list:
|
| 431 |
-
n
|
| 432 |
econ = [t for t in texts if any(kw in t.lower() for kw in ECON_TRIGGER)]
|
| 433 |
if len(econ) < 10:
|
| 434 |
start = texts[:min(10, n)]
|
|
@@ -438,7 +479,8 @@ def get_economic_chunks(texts: list, max_chunks: int = 40) -> list:
|
|
| 438 |
if len(econ) > max_chunks:
|
| 439 |
step = max(1, len(econ) // max_chunks)
|
| 440 |
sample = econ[::step][:max_chunks]
|
| 441 |
-
else:
|
|
|
|
| 442 |
return sample
|
| 443 |
|
| 444 |
def llm_groq(question, rag_context, history, lang):
|
|
@@ -451,24 +493,32 @@ def llm_groq(question, rag_context, history, lang):
|
|
| 451 |
"- Be concise, helpful, accurate."
|
| 452 |
)
|
| 453 |
messages = [{"role": "system", "content": system_prompt}]
|
| 454 |
-
for turn in history[-4:]:
|
|
|
|
| 455 |
user_content = f"๐ Context:\n{rag_context}\n\nQuestion: {question}" if rag_context else question
|
| 456 |
messages.append({"role": "user", "content": user_content})
|
| 457 |
try:
|
| 458 |
-
r = groq_client.chat.completions.create(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 459 |
return r.choices[0].message.content.strip()
|
| 460 |
-
except Exception as e:
|
|
|
|
| 461 |
|
| 462 |
def smart_answer(question, history):
|
| 463 |
-
lang
|
| 464 |
results = rag_retrieve(question, k=5, top_n=3)
|
| 465 |
rag_context = ""
|
| 466 |
if results:
|
| 467 |
-
for r in results:
|
|
|
|
| 468 |
has_good_rag = bool(results) and results[0]["sem"] >= 0.25
|
| 469 |
-
answer_text
|
| 470 |
if has_good_rag:
|
| 471 |
-
src
|
| 472 |
badge = f"\n\n๐ **{'ุงูู
ุตุฏุฑ' if lang=='ar' else 'Source'}:** {src}"
|
| 473 |
CHAT_STATS["found"] += 1
|
| 474 |
else:
|
|
@@ -480,7 +530,8 @@ def smart_answer(question, history):
|
|
| 480 |
def predict_with_rag(text):
|
| 481 |
text = "" if text is None else str(text).strip()
|
| 482 |
if not text: raise gr.Error("โ ๏ธ Enter text first.")
|
| 483 |
-
lang
|
|
|
|
| 484 |
exact_hits = []
|
| 485 |
for i, chunk in enumerate(KB_TEXTS):
|
| 486 |
if not isinstance(chunk, str): continue
|
|
@@ -490,8 +541,13 @@ def predict_with_rag(text):
|
|
| 490 |
if re.search(rf"\b{re.escape(term)}\b", cl):
|
| 491 |
for s in re.split(r"(?<=[.!?ุ\n])\s+", chunk):
|
| 492 |
if re.search(rf"\b{re.escape(term)}\b", s.lower()):
|
| 493 |
-
exact_hits.append({
|
|
|
|
|
|
|
|
|
|
|
|
|
| 494 |
except: continue
|
|
|
|
| 495 |
sem_results, md = rag_retrieve(text, k=5, top_n=3), ""
|
| 496 |
if exact_hits:
|
| 497 |
seen, unique = set(), []
|
|
@@ -510,33 +566,49 @@ def predict_with_rag(text):
|
|
| 510 |
k2 = (h["file"], h["chunk_id"])
|
| 511 |
if k2 in seen2: continue
|
| 512 |
seen2.add(k2)
|
| 513 |
-
md
|
| 514 |
else:
|
| 515 |
sent, conf = "โ Not found", 0.0
|
| 516 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 517 |
if sem_results:
|
| 518 |
md += "---\n## ๐ Semantic Results\n\n"
|
| 519 |
for r in sem_results:
|
| 520 |
-
bar
|
| 521 |
snippet = r["text"][:300].strip()
|
| 522 |
for t in qterms:
|
| 523 |
try: snippet = re.sub(rf"(?i)({re.escape(t)})", r"**\1**", snippet)
|
| 524 |
except: pass
|
| 525 |
-
md +=
|
| 526 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 527 |
return sent, round(conf, 4), md
|
| 528 |
|
| 529 |
# ============================================================
|
| 530 |
-
#
|
| 531 |
# ============================================================
|
| 532 |
def get_worldbank_data(country_code, indicator, start_year, end_year):
|
| 533 |
-
url =
|
|
|
|
|
|
|
|
|
|
| 534 |
try:
|
| 535 |
resp = requests.get(url, timeout=15)
|
| 536 |
resp.raise_for_status()
|
| 537 |
data = resp.json()
|
| 538 |
if not data or len(data) < 2 or not data[1]: return pd.DataFrame()
|
| 539 |
-
rows = [
|
|
|
|
|
|
|
|
|
|
|
|
|
| 540 |
return pd.DataFrame(rows).dropna().sort_values("year").reset_index(drop=True)
|
| 541 |
except Exception as e:
|
| 542 |
print(f"World Bank error: {e}")
|
|
@@ -553,16 +625,26 @@ def build_doc_sentiment_index():
|
|
| 553 |
sample = get_economic_chunks(texts, max_chunks=40)
|
| 554 |
scores = [sentiment_score_numeric(t) for t in sample]
|
| 555 |
avg = round(float(np.mean(scores)), 4)
|
| 556 |
-
year = next(
|
|
|
|
|
|
|
| 557 |
file_results.append({
|
| 558 |
-
"file": fname, "year": year if year else "N/A",
|
| 559 |
-
"
|
|
|
|
| 560 |
})
|
| 561 |
-
if year:
|
|
|
|
| 562 |
|
| 563 |
-
yearly_avg = {
|
|
|
|
|
|
|
|
|
|
| 564 |
df_files = pd.DataFrame(file_results).sort_values("year")
|
| 565 |
-
df_yearly =
|
|
|
|
|
|
|
|
|
|
| 566 |
return df_files, df_yearly
|
| 567 |
|
| 568 |
def run_adf_check(series: np.ndarray, name: str):
|
|
@@ -570,62 +652,79 @@ def run_adf_check(series: np.ndarray, name: str):
|
|
| 570 |
def adf_p(s):
|
| 571 |
try: return adfuller(s, autolag='AIC')[1]
|
| 572 |
except: return 1.0
|
| 573 |
-
|
| 574 |
s = series.copy()
|
| 575 |
p0 = adf_p(s)
|
| 576 |
-
if p0 <= 0.05:
|
| 577 |
-
|
| 578 |
s1 = np.diff(s)
|
| 579 |
p1 = adf_p(s1)
|
| 580 |
-
if p1 <= 0.05:
|
| 581 |
-
|
| 582 |
s2 = np.diff(s1)
|
| 583 |
p2 = adf_p(s2)
|
| 584 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 585 |
|
| 586 |
def run_granger_test(series_y, series_exog, maxlag=4):
|
| 587 |
try:
|
| 588 |
from statsmodels.tsa.stattools import grangercausalitytests
|
| 589 |
-
if len(series_y) < 10:
|
|
|
|
| 590 |
sy, status_y = run_adf_check(series_y.copy(), "Target")[:2]
|
| 591 |
sexog, status_exog = run_adf_check(series_exog.copy(), "Sentiment")[:2]
|
| 592 |
-
|
| 593 |
min_len = min(len(sy), len(sexog))
|
| 594 |
sy, sexog = sy[-min_len:], sexog[-min_len:]
|
| 595 |
maxlag = min(maxlag, max(1, (len(sy) - 1) // 3))
|
| 596 |
-
|
| 597 |
-
|
| 598 |
-
|
| 599 |
-
|
|
|
|
| 600 |
rows, any_pass, best_p = [], False, 1.0
|
| 601 |
for lag, res in gc_result.items():
|
| 602 |
p_val = res[0]["ssr_ftest"][1]
|
| 603 |
-
|
| 604 |
-
|
| 605 |
-
|
|
|
|
| 606 |
best_p = min(best_p, p_val)
|
| 607 |
-
rows.append(f"| {lag} | {
|
| 608 |
|
| 609 |
table = (
|
| 610 |
"### ๐ฌ Granger Causality Test\n"
|
| 611 |
"*Hโ: Sentiment does NOT Granger-cause Target*\n\n"
|
| 612 |
-
f"
|
| 613 |
-
"|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 614 |
)
|
| 615 |
-
if any_pass:
|
| 616 |
-
|
| 617 |
-
|
|
|
|
|
|
|
|
|
|
| 618 |
return table + verdict, any_pass
|
| 619 |
-
except Exception as e:
|
|
|
|
| 620 |
|
| 621 |
def run_dm_test(actual, pred_arima, pred_sarimax):
|
| 622 |
try:
|
| 623 |
n = len(actual)
|
| 624 |
-
if n < 3:
|
| 625 |
-
|
| 626 |
-
|
| 627 |
-
|
| 628 |
-
|
|
|
|
|
|
|
| 629 |
dm_stat = d_mean / (d_std / np.sqrt(n))
|
| 630 |
p_val = 2 * (1 - stats.t.cdf(abs(dm_stat), df=n - 1))
|
| 631 |
sig = "โ
Yes" if p_val < 0.05 else ("๐ถ Marginal" if p_val < 0.10 else "โ No")
|
|
@@ -633,23 +732,36 @@ def run_dm_test(actual, pred_arima, pred_sarimax):
|
|
| 633 |
|
| 634 |
table = (
|
| 635 |
"### ๐ฏ Diebold-Mariano Test\n"
|
|
|
|
| 636 |
"| DM Statistic | p-value | n (test) | Significant? | Better Model |\n"
|
| 637 |
"|-------------|---------|----------|-------------|-------------|\n"
|
| 638 |
f"| `{dm_stat:.4f}` | `{p_val:.4f}` | `{n}` | {sig} | **{better}** |\n"
|
| 639 |
)
|
| 640 |
passed = p_val < 0.05 and dm_stat > 0
|
| 641 |
-
if passed:
|
| 642 |
-
|
| 643 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 644 |
return table + verdict, passed
|
| 645 |
-
except Exception as e:
|
|
|
|
| 646 |
|
|
|
|
|
|
|
|
|
|
| 647 |
def run_economic_forecast(country_code, target_var, start_year, end_year):
|
| 648 |
try:
|
| 649 |
from statsmodels.tsa.arima.model import ARIMA
|
| 650 |
from statsmodels.tsa.statespace.sarimax import SARIMAX
|
| 651 |
from sklearn.metrics import mean_squared_error, mean_absolute_error
|
| 652 |
-
except ImportError:
|
|
|
|
| 653 |
|
| 654 |
indicator_map = {
|
| 655 |
"Inflation (CPI %)" : "FP.CPI.TOTL.ZG",
|
|
@@ -657,190 +769,424 @@ def run_economic_forecast(country_code, target_var, start_year, end_year):
|
|
| 657 |
"Unemployment (%) ": "SL.UEM.TOTL.ZS",
|
| 658 |
"Exchange Rate" : "PA.NUS.FCRF",
|
| 659 |
}
|
| 660 |
-
econ_df = get_worldbank_data(
|
| 661 |
-
|
| 662 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 663 |
|
| 664 |
df_files, df_yearly = build_doc_sentiment_index()
|
| 665 |
|
| 666 |
if df_yearly is not None and len(df_yearly) >= 2:
|
| 667 |
-
merged
|
| 668 |
-
merged["sentiment"] = merged["sentiment"].fillna(
|
| 669 |
-
|
|
|
|
|
|
|
|
|
|
| 670 |
else:
|
| 671 |
-
global_sent =
|
|
|
|
|
|
|
|
|
|
| 672 |
merged = econ_df.copy()
|
| 673 |
merged["sentiment"] = global_sent
|
| 674 |
-
has_yearly
|
|
|
|
| 675 |
|
| 676 |
if merged["sentiment"].std() > 1e-6:
|
| 677 |
scaler = MinMaxScaler(feature_range=(-0.3, 0.3))
|
| 678 |
-
merged["sentiment"] = scaler.fit_transform(
|
|
|
|
|
|
|
| 679 |
|
| 680 |
series = merged["value"].values.astype(float)
|
| 681 |
exog = merged["sentiment"].values.reshape(-1, 1)
|
| 682 |
years = merged["year"].values
|
| 683 |
n = len(series)
|
| 684 |
|
| 685 |
-
# ==========================================================
|
| 686 |
-
# โ
|
| 687 |
-
# ==========================================================
|
| 688 |
split = n - 3
|
| 689 |
-
if split < 5:
|
| 690 |
-
split = max(int(n * 0.75), 5)
|
| 691 |
|
| 692 |
-
train_y,
|
| 693 |
train_exog, test_exog = exog[:split], exog[split:]
|
| 694 |
test_years = years[split:]
|
| 695 |
|
|
|
|
| 696 |
try:
|
| 697 |
-
m1
|
| 698 |
pred_arima = m1.forecast(len(test_y))
|
| 699 |
rmse_a = float(np.sqrt(mean_squared_error(test_y, pred_arima)))
|
| 700 |
mae_a = float(mean_absolute_error(test_y, pred_arima))
|
| 701 |
mape_a = float(np.mean(np.abs((test_y-pred_arima)/np.maximum(np.abs(test_y),1e-8)))*100)
|
| 702 |
-
except Exception as e:
|
|
|
|
| 703 |
|
|
|
|
| 704 |
try:
|
| 705 |
-
m2
|
| 706 |
pred_sarimax = m2.forecast(len(test_y), exog=test_exog)
|
| 707 |
rmse_s = float(np.sqrt(mean_squared_error(test_y, pred_sarimax)))
|
| 708 |
mae_s = float(mean_absolute_error(test_y, pred_sarimax))
|
| 709 |
mape_s = float(np.mean(np.abs((test_y-pred_sarimax)/np.maximum(np.abs(test_y),1e-8)))*100)
|
| 710 |
-
except Exception as e:
|
|
|
|
| 711 |
|
| 712 |
impr_rmse = (rmse_a - rmse_s) / rmse_a * 100
|
| 713 |
impr_mae = (mae_a - mae_s) / mae_a * 100
|
| 714 |
impr_mape = (mape_a - mape_s) / mape_a * 100
|
| 715 |
|
|
|
|
| 716 |
if has_yearly and df_yearly is not None and len(df_yearly) >= 5:
|
| 717 |
real_merged = econ_df.merge(df_yearly, on="year", how="inner")
|
| 718 |
-
gc_y
|
|
|
|
| 719 |
else:
|
| 720 |
-
gc_y
|
|
|
|
| 721 |
|
| 722 |
granger_md, granger_pass = run_granger_test(gc_y, gc_exog, maxlag=4)
|
| 723 |
dm_md, dm_pass = run_dm_test(test_y, np.array(pred_arima), np.array(pred_sarimax))
|
| 724 |
|
|
|
|
|
|
|
|
|
|
| 725 |
fig, axes = plt.subplots(4, 1, figsize=(11, 18))
|
|
|
|
|
|
|
| 726 |
axes[0].plot(years, series, "o-", color="#2196F3", label="Actual", lw=2, ms=5)
|
| 727 |
-
axes[0].plot(test_years, pred_arima,
|
| 728 |
axes[0].plot(test_years, pred_sarimax, "^-.", color="#4CAF50", label="SARIMAX+Ensemble", lw=2)
|
| 729 |
axes[0].axvline(x=years[split-1], color="gray", linestyle=":", alpha=0.7, label="TrainโTest")
|
| 730 |
-
axes[0].set_title(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 731 |
axes[0].legend(fontsize=9); axes[0].grid(True, alpha=0.3)
|
| 732 |
|
| 733 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 734 |
axes[1].bar(years, merged["sentiment"], color=s_clrs, edgecolor="white", width=0.6)
|
| 735 |
axes[1].axhline(y=0, color="black", lw=0.8)
|
| 736 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 737 |
axes[1].grid(True, alpha=0.3, axis="y")
|
| 738 |
|
| 739 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 740 |
for bar, val in zip(bars, [rmse_a, rmse_s]):
|
| 741 |
-
axes[2].text(
|
| 742 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 743 |
|
|
|
|
| 744 |
axes[3].axis("off")
|
| 745 |
test_data = [
|
| 746 |
["Test", "Result", "Interpretation"],
|
| 747 |
-
[
|
| 748 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 749 |
]
|
| 750 |
-
tbl4 = axes[3].table(
|
|
|
|
|
|
|
|
|
|
| 751 |
tbl4.auto_set_font_size(False); tbl4.set_fontsize(11); tbl4.scale(1, 2.5)
|
| 752 |
for (row, col), cell in tbl4.get_celld().items():
|
| 753 |
-
if row == 0:
|
| 754 |
-
|
| 755 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 756 |
|
| 757 |
plt.tight_layout(pad=3.0)
|
| 758 |
img_path = "/tmp/forecast_plot.png"
|
| 759 |
-
plt.savefig(img_path, dpi=130, bbox_inches="tight")
|
|
|
|
| 760 |
|
|
|
|
|
|
|
|
|
|
| 761 |
sent_table = ""
|
| 762 |
if df_files is not None and len(df_files) > 0:
|
| 763 |
-
sent_table =
|
| 764 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 765 |
|
| 766 |
result_md = (
|
| 767 |
-
f"## ๐ Forecast
|
| 768 |
-
f"| | |\n|---|---|\n
|
| 769 |
-
f"
|
| 770 |
-
f"|
|
| 771 |
-
f"|
|
| 772 |
-
f"|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 773 |
f"---\n{granger_md}\n\n---\n{dm_md}\n{sent_table}"
|
| 774 |
)
|
| 775 |
return result_md, img_path
|
| 776 |
|
| 777 |
# ============================================================
|
| 778 |
-
#
|
| 779 |
# ============================================================
|
| 780 |
def generate_report(text, sent, conf, md):
|
| 781 |
path = "/tmp/report.md"
|
| 782 |
-
with open(path, "w", encoding="utf-8") as f:
|
|
|
|
| 783 |
return path
|
|
|
|
| 784 |
def export_chat(history):
|
| 785 |
path = "/tmp/chat.txt"
|
| 786 |
with open(path, "w", encoding="utf-8") as f:
|
| 787 |
-
for turn in history:
|
|
|
|
| 788 |
return path
|
| 789 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 790 |
def get_top_keywords():
|
| 791 |
-
if not KB_TEXTS: return "_No
|
| 792 |
-
|
| 793 |
-
|
| 794 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 795 |
def chat_text(message, history):
|
| 796 |
if not message.strip(): return "", history
|
| 797 |
answer, _ = smart_answer(message, history)
|
| 798 |
-
return "", history + [
|
|
|
|
|
|
|
|
|
|
|
|
|
| 799 |
def tts_save(text, lang="en"):
|
| 800 |
path = "/tmp/ans.mp3"
|
| 801 |
-
gTTS(
|
|
|
|
|
|
|
|
|
|
| 802 |
return path
|
|
|
|
| 803 |
def chat_voice(audio, history):
|
| 804 |
-
if audio is None: raise gr.Error("No audio.")
|
| 805 |
sr, y = audio
|
| 806 |
y = np.array(y) if isinstance(y, list) else y
|
| 807 |
if y.ndim > 1: y = y.mean(axis=1)
|
| 808 |
transcript = asr({"array": y.astype(np.float32), "sampling_rate": sr})["text"]
|
| 809 |
-
lang
|
| 810 |
-
answer, _
|
| 811 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 812 |
|
| 813 |
-
|
| 814 |
-
gr.Markdown("# ๐ค Multilingual RAG + Ensemble Sentiment + Economic Forecast")
|
| 815 |
-
|
| 816 |
with gr.Tab("๐ 1 ยท Upload"):
|
| 817 |
-
files
|
| 818 |
-
|
| 819 |
-
|
| 820 |
-
|
| 821 |
-
|
| 822 |
-
|
| 823 |
-
|
| 824 |
-
|
| 825 |
-
|
| 826 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 827 |
run_btn.click(predict_with_rag, inputs=inp, outputs=[out_sent, out_conf, out_full])
|
|
|
|
| 828 |
|
| 829 |
-
|
| 830 |
-
|
| 831 |
-
|
| 832 |
-
msg.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 833 |
|
|
|
|
| 834 |
with gr.Tab("๐ 7 ยท Forecast"):
|
| 835 |
-
gr.Markdown(
|
|
|
|
|
|
|
|
|
|
| 836 |
with gr.Row():
|
| 837 |
-
country_input = gr.Textbox(
|
| 838 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 839 |
with gr.Row():
|
| 840 |
-
start_year = gr.Slider(
|
| 841 |
-
|
| 842 |
-
|
| 843 |
-
|
| 844 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 845 |
|
| 846 |
app.launch(server_name="0.0.0.0", server_port=7860, show_api=False)
|
|
|
|
| 34 |
# ============================================================
|
| 35 |
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
|
| 36 |
groq_client = Groq(api_key=GROQ_API_KEY)
|
| 37 |
+
print(f"DEBUG โ Groq Key loaded: {bool(GROQ_API_KEY)}")
|
| 38 |
|
| 39 |
# ============================================================
|
| 40 |
# GLOBAL STATE
|
|
|
|
| 54 |
FAISS_PATH = f"{PERSIST_DIR}/faiss.index"
|
| 55 |
os.makedirs(PERSIST_DIR, exist_ok=True)
|
| 56 |
|
| 57 |
+
# ============================================================
|
| 58 |
+
# PERSIST
|
| 59 |
+
# ============================================================
|
| 60 |
def save_index():
|
| 61 |
if FAISS_INDEX is None or not KB_TEXTS:
|
| 62 |
return "โ ๏ธ No index to save."
|
|
|
|
| 153 |
}
|
| 154 |
doc_type = max(scores, key=scores.get)
|
| 155 |
confidence = round(scores[doc_type] / max(sum(scores.values()), 1), 2)
|
| 156 |
+
icons = {
|
| 157 |
"economic":"๐ Economic","medical":"๐ฅ Medical",
|
| 158 |
"legal":"โ๏ธ Legal","academic":"๐ Academic","general":"๐ General",
|
| 159 |
}
|
|
|
|
| 166 |
}
|
| 167 |
|
| 168 |
# ============================================================
|
| 169 |
+
# AI MODELS โ Ensemble: FinBERT 40% + XLM 30% + Lexicon 30%
|
| 170 |
# ============================================================
|
| 171 |
WEIGHTS = {"finbert": 0.40, "xlm": 0.30, "lexicon": 0.30}
|
| 172 |
|
| 173 |
+
print("โณ Loading FinBERT...")
|
| 174 |
try:
|
| 175 |
finbert_pipe = pipeline(
|
| 176 |
"text-classification",
|
|
|
|
| 233 |
|
| 234 |
def run_sentiment(text: str):
    """Classify *text* with the weighted ensemble score.

    Returns:
        (label, confidence): label is Positive/Negative/Neutral with an emoji,
        confidence is |score| clipped to 1.0 and rounded to 4 decimals.
    """
    score = sentiment_score_numeric(text)
    # Fixed dead-zone thresholds: |score| <= 0.05 counts as neutral.
    if score > 0.05: sent = "Positive ๐"
    elif score < -0.05: sent = "Negative ๐"
    else: sent = "Neutral ๐"
    return sent, round(min(abs(score), 1.0), 4)
|
| 240 |
|
| 241 |
def run_sentiment_detailed(text: str) -> str:
|
|
|
|
| 251 |
return (
|
| 252 |
f"### ๐ Ensemble Sentiment Breakdown\n\n"
|
| 253 |
f"| Model | Score | Bar | Weight |\n|---|---|---|---|\n"
|
| 254 |
+
f"| ๐ฆ FinBERT | `{fb:+.4f}` | {bar(fb)} | **40%** |\n"
|
| 255 |
+
f"| ๐ XLM-RoBERTa | `{xlm:+.4f}` | {bar(xlm)} | **30%** |\n"
|
| 256 |
+
f"| ๐ Lexicon | `{lex:+.4f}` | {bar(lex)} | **30%** |\n"
|
| 257 |
+
f"| โก **Final** | **`{final:+.4f}`** | {bar(final)} | **100%** |\n\n"
|
| 258 |
f"{label}"
|
| 259 |
)
|
| 260 |
|
| 261 |
+
# ============================================================
|
| 262 |
+
# EMBEDDING + RERANKER + ASR
|
| 263 |
+
# ============================================================
|
| 264 |
+
print("โณ Loading Embedder, Reranker, ASR...")
|
| 265 |
embedder = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
|
| 266 |
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2", max_length=512)
|
| 267 |
+
asr = hf_pipeline(
|
| 268 |
+
"automatic-speech-recognition",
|
| 269 |
+
model="openai/whisper-small",
|
| 270 |
+
device=0 if torch.cuda.is_available() else -1,
|
| 271 |
+
)
|
| 272 |
_ = embedder.encode(["warmup"], convert_to_numpy=True)
|
| 273 |
print("โ
All models loaded!")
|
| 274 |
|
| 275 |
_startup = load_saved_index()
|
| 276 |
+
print(f"๐ Startup load: {_startup}")
|
| 277 |
|
| 278 |
# ============================================================
|
| 279 |
# RAG CORE
|
| 280 |
# ============================================================
|
| 281 |
+
def clean_filename(path: str) -> str:
    """Return only the final component of *path* (the bare file name)."""
    normalized = str(path)
    return os.path.basename(normalized)
|
| 283 |
+
|
| 284 |
def detect_lang(text: str) -> str:
    """Detect whether *text* is Arabic ('ar') or anything else ('en').

    Only the first 300 characters are sampled for speed. Any detection
    failure (empty/undetectable text, detector unavailable) falls back
    to 'en'.
    """
    try:
        # langdetect returns codes like 'ar', 'en', 'fr'; only Arabic matters here.
        code = str(detect(str(text)[:300]))
    except Exception:  # narrowed from bare `except:` so KeyboardInterrupt propagates
        return "en"
    return "ar" if code.startswith("ar") else "en"
|
| 289 |
|
| 290 |
def extract_year_from_filename(filename: str):
|
| 291 |
full_path = str(filename).replace("\\", "/")
|
| 292 |
for part in reversed(full_path.split("/")):
|
| 293 |
m = re.findall(r"\b(20\d{2}|19\d{2})\b", part)
|
| 294 |
if m: return int(m[0])
|
| 295 |
+
for pat in [r'WEO[_\-\s]?(\d{4})', r'BOA[_\-\s]?(\d{4})',
|
| 296 |
+
r'IMF[_\-\s]?(\d{4})', r'rapport[_\-\s]?(\d{4})',
|
| 297 |
+
r'report[_\-\s]?(\d{4})']:
|
| 298 |
m = re.search(pat, full_path, re.IGNORECASE)
|
| 299 |
if m: return int(m.group(1))
|
| 300 |
all_y = re.findall(r'\b(19\d{2}|20\d{2})\b', full_path)
|
|
|
|
| 340 |
from docx import Document
|
| 341 |
doc = Document(path)
|
| 342 |
pars = [p.text for p in doc.paragraphs if p.text.strip()]
|
| 343 |
+
return [{"text": "\n".join(pars[i:i+50]), "page": i//50+1}
|
| 344 |
+
for i in range(0, len(pars), 50)] or [{"text":"Empty DOCX.","page":1}]
|
| 345 |
+
except Exception as e:
|
| 346 |
+
return [{"text": f"DOCX error: {e}", "page": 1}]
|
| 347 |
if path.endswith(".csv"):
|
| 348 |
df = pd.read_csv(path)
|
| 349 |
col = "text" if "text" in df.columns else df.columns[0]
|
| 350 |
+
return [{"text": t, "page": i+1}
|
| 351 |
+
for i, t in enumerate(df[col].dropna().astype(str))]
|
| 352 |
with open(path, "r", encoding="utf-8", errors="ignore") as f:
|
| 353 |
return [{"text": f.read(), "page": 1}]
|
| 354 |
|
|
|
|
| 359 |
file_paths = []
|
| 360 |
if not isinstance(files, list): files = [files]
|
| 361 |
for f in files:
|
| 362 |
+
if isinstance(f, str): file_paths.append(f)
|
| 363 |
+
elif isinstance(f, dict): file_paths.append(f.get("path") or f.get("name") or str(f))
|
| 364 |
+
elif hasattr(f, "name"): file_paths.append(f.name)
|
| 365 |
+
else: file_paths.append(str(f))
|
| 366 |
|
| 367 |
for p in file_paths:
|
| 368 |
full_path = str(p)
|
| 369 |
fname = clean_filename(full_path)
|
| 370 |
year = extract_year_from_filename(fname) or extract_year_from_filename(full_path)
|
| 371 |
+
pages = load_file(full_path)
|
| 372 |
+
file_texts = []
|
| 373 |
for pg in pages:
|
| 374 |
for ch in chunk_text(pg["text"]):
|
| 375 |
KB_TEXTS.append(ch)
|
| 376 |
+
KB_META.append({"name": fname, "lang": detect_lang(ch),
|
| 377 |
+
"page": pg["page"], "year": year})
|
| 378 |
file_texts.append(ch)
|
| 379 |
ti = detect_document_type(file_texts)
|
| 380 |
ti["year"] = year
|
| 381 |
PER_FILE_INFO[fname] = ti
|
| 382 |
|
| 383 |
if not KB_TEXTS: raise gr.Error("โ ๏ธ No text extracted.")
|
| 384 |
+
KB_EMB = embedder.encode(
|
| 385 |
+
KB_TEXTS, convert_to_numpy=True,
|
| 386 |
+
normalize_embeddings=True, show_progress_bar=False
|
| 387 |
+
).astype("float32")
|
| 388 |
FAISS_INDEX = faiss.IndexFlatIP(KB_EMB.shape[1])
|
| 389 |
FAISS_INDEX.add(KB_EMB)
|
| 390 |
DOC_TYPE_INFO = detect_document_type(KB_TEXTS)
|
|
|
|
| 399 |
tbl += f"| `{fname}` | {yrb} | {info['type']}{badge} | {info['confidence']:.0%} | {n} |\n"
|
| 400 |
|
| 401 |
ef = [f for f,i in PER_FILE_INFO.items() if i["is_economic"]]
|
| 402 |
+
fmsg = (
|
| 403 |
+
f"\n\n๐ข **Economic files detected:** " +
|
| 404 |
+
", ".join(f"`{f}`" for f in ef) +
|
| 405 |
+
"\nโก๏ธ Go to **๐ 7 ยท Forecast** tab to run predictions."
|
| 406 |
+
) if ef else ""
|
| 407 |
save_index()
|
| 408 |
+
return (
|
| 409 |
+
f"โ
**Index built!**\n\n"
|
| 410 |
+
f"| | |\n|---|---|\n"
|
| 411 |
+
f"| ๐ฆ Total chunks | **{len(KB_TEXTS):,}** |\n"
|
| 412 |
+
f"| ๐ Files | **{len(file_paths)}** |\n"
|
| 413 |
+
f"| ๐ธ๐ฆ Arabic | **{lang_count.get('ar',0):,}** |\n"
|
| 414 |
+
f"| ๐บ๐ธ English | **{lang_count.get('en',0):,}** |\n\n"
|
| 415 |
+
f"---\n### ๐ Per-File Analysis\n\n{tbl}{fmsg}"
|
| 416 |
+
)
|
| 417 |
|
| 418 |
def bm25_score(query_terms, doc, k1=1.5, b=0.75, avg_dl=200):
    """Okapi BM25 score of *doc* against *query_terms*.

    Document frequencies come from the global KB_TEXTS corpus (substring
    match per chunk).

    Args:
        query_terms: iterable of query tokens; non-string/empty entries skipped.
        doc: candidate chunk text.
        k1, b: standard BM25 term-saturation / length-normalization constants.
        avg_dl: assumed average document length in tokens.

    Returns:
        float BM25 score; 0.0 for invalid input or on any internal error.
    """
    try:
        if not KB_TEXTS or not isinstance(doc, str):
            return 0.0
        dl, score = len(doc.split()), 0.0
        df = Counter(doc.lower().split())
        n_total = len(KB_TEXTS)  # hoisted: corpus size is loop-invariant
        for term in query_terms:
            if not isinstance(term, str) or not term:
                continue
            tl = term.lower()
            # document frequency: number of chunks containing the term as a substring
            n_doc = sum(1 for t in KB_TEXTS if isinstance(t, str) and tl in t.lower())
            tf = df.get(tl, 0)
            idf = math.log((n_total + 1) / (1 + n_doc))
            score += idf * (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * dl / max(avg_dl, 1)))
        return score
    except Exception:  # narrowed from bare `except:`; still degrades softly to 0.0
        return 0.0
|
|
|
|
| 433 |
def rag_retrieve(query, k=5, top_n=3):
|
| 434 |
if FAISS_INDEX is None or not KB_TEXTS: return []
|
| 435 |
try:
|
| 436 |
+
q_emb = embedder.encode(
|
| 437 |
+
[query], convert_to_numpy=True, normalize_embeddings=True
|
| 438 |
+
).astype("float32")
|
| 439 |
scores, idx = FAISS_INDEX.search(q_emb, min(k*3, len(KB_TEXTS)))
|
| 440 |
candidates, qterms = [], [t for t in re.findall(r"\w+", str(query).lower()) if t]
|
| 441 |
for rank, i in enumerate(idx[0]):
|
| 442 |
if i == -1: continue
|
| 443 |
+
sem = float(scores[0][rank])
|
| 444 |
if sem < MIN_SIMILARITY: continue
|
| 445 |
text = KB_TEXTS[i]
|
| 446 |
if not isinstance(text, str): continue
|
| 447 |
kw = bm25_score(qterms, text)
|
| 448 |
lterms = [t for t in qterms if len(t) > 2]
|
| 449 |
+
try:
|
| 450 |
+
exact = all(re.search(rf"\b{re.escape(t)}\b", text.lower()) for t in lterms) if lterms else False
|
| 451 |
except: exact = False
|
| 452 |
hybrid = sem*0.6 + min(kw/10, 0.4) + (0.15 if exact else 0.0)
|
| 453 |
candidates.append({
|
| 454 |
"idx": i, "sem": sem, "kw": kw, "exact": exact, "hybrid": hybrid,
|
| 455 |
"lang": KB_META[i]["lang"], "file": KB_META[i]["name"],
|
| 456 |
+
"page": KB_META[i]["page"], "year": KB_META[i].get("year"),
|
| 457 |
+
"text": text,
|
| 458 |
})
|
| 459 |
if not candidates: return []
|
| 460 |
ce_scores = reranker.predict([[query, c["text"]] for c in candidates])
|
|
|
|
| 469 |
return []
|
| 470 |
|
| 471 |
def get_economic_chunks(texts: list, max_chunks: int = 40) -> list:
|
| 472 |
+
n = len(texts)
|
| 473 |
econ = [t for t in texts if any(kw in t.lower() for kw in ECON_TRIGGER)]
|
| 474 |
if len(econ) < 10:
|
| 475 |
start = texts[:min(10, n)]
|
|
|
|
| 479 |
if len(econ) > max_chunks:
|
| 480 |
step = max(1, len(econ) // max_chunks)
|
| 481 |
sample = econ[::step][:max_chunks]
|
| 482 |
+
else:
|
| 483 |
+
sample = econ
|
| 484 |
return sample
|
| 485 |
|
| 486 |
def llm_groq(question, rag_context, history, lang):
|
|
|
|
| 493 |
"- Be concise, helpful, accurate."
|
| 494 |
)
|
| 495 |
messages = [{"role": "system", "content": system_prompt}]
|
| 496 |
+
for turn in history[-4:]:
|
| 497 |
+
messages.append({"role": turn["role"], "content": turn["content"]})
|
| 498 |
user_content = f"๐ Context:\n{rag_context}\n\nQuestion: {question}" if rag_context else question
|
| 499 |
messages.append({"role": "user", "content": user_content})
|
| 500 |
try:
|
| 501 |
+
r = groq_client.chat.completions.create(
|
| 502 |
+
model="llama-3.3-70b-versatile",
|
| 503 |
+
messages=messages,
|
| 504 |
+
temperature=0.3,
|
| 505 |
+
max_tokens=512,
|
| 506 |
+
)
|
| 507 |
return r.choices[0].message.content.strip()
|
| 508 |
+
except Exception as e:
|
| 509 |
+
return f"โ ๏ธ Groq error: {e}"
|
| 510 |
|
| 511 |
def smart_answer(question, history):
|
| 512 |
+
lang = detect_lang(question)
|
| 513 |
results = rag_retrieve(question, k=5, top_n=3)
|
| 514 |
rag_context = ""
|
| 515 |
if results:
|
| 516 |
+
for r in results:
|
| 517 |
+
rag_context += f"[Source: {r['file']} - Page {r['page']}]\n{r['text']}\n\n"
|
| 518 |
has_good_rag = bool(results) and results[0]["sem"] >= 0.25
|
| 519 |
+
answer_text = llm_groq(question, rag_context[:2000], history, lang)
|
| 520 |
if has_good_rag:
|
| 521 |
+
src = ", ".join(f"`{r['file']}` p.{r['page']}" for r in results)
|
| 522 |
badge = f"\n\n๐ **{'ุงูู
ุตุฏุฑ' if lang=='ar' else 'Source'}:** {src}"
|
| 523 |
CHAT_STATS["found"] += 1
|
| 524 |
else:
|
|
|
|
| 530 |
def predict_with_rag(text):
|
| 531 |
text = "" if text is None else str(text).strip()
|
| 532 |
if not text: raise gr.Error("โ ๏ธ Enter text first.")
|
| 533 |
+
lang = detect_lang(text)
|
| 534 |
+
qterms = [t for t in re.findall(r"\w+", text.lower()) if len(t) > 2]
|
| 535 |
exact_hits = []
|
| 536 |
for i, chunk in enumerate(KB_TEXTS):
|
| 537 |
if not isinstance(chunk, str): continue
|
|
|
|
| 541 |
if re.search(rf"\b{re.escape(term)}\b", cl):
|
| 542 |
for s in re.split(r"(?<=[.!?ุ\n])\s+", chunk):
|
| 543 |
if re.search(rf"\b{re.escape(term)}\b", s.lower()):
|
| 544 |
+
exact_hits.append({
|
| 545 |
+
"word": term, "file": KB_META[i]["name"],
|
| 546 |
+
"sentence": s.strip(), "lang": KB_META[i]["lang"],
|
| 547 |
+
"chunk_id": i, "page": KB_META[i]["page"],
|
| 548 |
+
})
|
| 549 |
except: continue
|
| 550 |
+
|
| 551 |
sem_results, md = rag_retrieve(text, k=5, top_n=3), ""
|
| 552 |
if exact_hits:
|
| 553 |
seen, unique = set(), []
|
|
|
|
| 566 |
k2 = (h["file"], h["chunk_id"])
|
| 567 |
if k2 in seen2: continue
|
| 568 |
seen2.add(k2)
|
| 569 |
+
md += f"### ๐ `{h['file']}` โ p.{h['page']} {'๐ธ๐ฆ' if h['lang']=='ar' else '๐บ๐ธ'}\n\n```\n{KB_TEXTS[h['chunk_id']]}\n```\n\n"
|
| 570 |
else:
|
| 571 |
sent, conf = "โ Not found", 0.0
|
| 572 |
+
if lang == "ar":
|
| 573 |
+
md += f"## โ ุงูููู
ุฉ ุบูุฑ ู
ูุฌูุฏุฉ\n\n**`{text}`** ูู
ุชูุฐูุฑ ุญุฑููุงู.\n\n"
|
| 574 |
+
else:
|
| 575 |
+
md += f"## โ Word Not Found\n\n**`{text}`** not found literally.\n\n"
|
| 576 |
+
|
| 577 |
if sem_results:
|
| 578 |
md += "---\n## ๐ Semantic Results\n\n"
|
| 579 |
for r in sem_results:
|
| 580 |
+
bar = "๐ฉ"*round(r["sem"]*10) + "โฌ"*(10-round(r["sem"]*10))
|
| 581 |
snippet = r["text"][:300].strip()
|
| 582 |
for t in qterms:
|
| 583 |
try: snippet = re.sub(rf"(?i)({re.escape(t)})", r"**\1**", snippet)
|
| 584 |
except: pass
|
| 585 |
+
md += (
|
| 586 |
+
f"### Result {r['rank']} โ {bar} `{r['sem']*100:.1f}%` "
|
| 587 |
+
f"{'๐ธ๐ฆ' if r['lang']=='ar' else '๐บ๐ธ'}\n\n"
|
| 588 |
+
f"๐ `{r['file']}` p.{r['page']}\n\n> {snippet}...\n\n"
|
| 589 |
+
)
|
| 590 |
+
else:
|
| 591 |
+
md += "---\n_No similar content found._\n"
|
| 592 |
return sent, round(conf, 4), md
|
| 593 |
|
| 594 |
# ============================================================
|
| 595 |
+
# ECONOMETRICS โ World Bank + ARIMA/SARIMAX
|
| 596 |
# ============================================================
|
| 597 |
def get_worldbank_data(country_code, indicator, start_year, end_year):
|
| 598 |
+
url = (
|
| 599 |
+
f"https://api.worldbank.org/v2/country/{country_code}/"
|
| 600 |
+
f"indicator/{indicator}?date={start_year}:{end_year}&per_page=100&format=json"
|
| 601 |
+
)
|
| 602 |
try:
|
| 603 |
resp = requests.get(url, timeout=15)
|
| 604 |
resp.raise_for_status()
|
| 605 |
data = resp.json()
|
| 606 |
if not data or len(data) < 2 or not data[1]: return pd.DataFrame()
|
| 607 |
+
rows = [
|
| 608 |
+
{"year": int(e["date"]), "value": float(e["value"])}
|
| 609 |
+
for e in data[1]
|
| 610 |
+
if e.get("value") is not None and e.get("date") is not None
|
| 611 |
+
]
|
| 612 |
return pd.DataFrame(rows).dropna().sort_values("year").reset_index(drop=True)
|
| 613 |
except Exception as e:
|
| 614 |
print(f"World Bank error: {e}")
|
|
|
|
| 625 |
sample = get_economic_chunks(texts, max_chunks=40)
|
| 626 |
scores = [sentiment_score_numeric(t) for t in sample]
|
| 627 |
avg = round(float(np.mean(scores)), 4)
|
| 628 |
+
year = next(
|
| 629 |
+
(m["year"] for m in KB_META if m["name"]==fname and m.get("year")), None
|
| 630 |
+
)
|
| 631 |
file_results.append({
|
| 632 |
+
"file": fname, "year": year if year else "N/A",
|
| 633 |
+
"sentiment": avg, "n_chunks": len(sample),
|
| 634 |
+
"label": "๐ข Optimistic" if avg > 0.05 else "๐ด Pessimistic" if avg < -0.05 else "๐ก Neutral",
|
| 635 |
})
|
| 636 |
+
if year:
|
| 637 |
+
yearly_sentiment.setdefault(year, []).append(avg)
|
| 638 |
|
| 639 |
+
yearly_avg = {
|
| 640 |
+
yr: round(float(np.mean(vals)), 4)
|
| 641 |
+
for yr, vals in yearly_sentiment.items()
|
| 642 |
+
}
|
| 643 |
df_files = pd.DataFrame(file_results).sort_values("year")
|
| 644 |
+
df_yearly = (
|
| 645 |
+
pd.DataFrame([{"year": y, "sentiment": s} for y, s in sorted(yearly_avg.items())])
|
| 646 |
+
if yearly_avg else None
|
| 647 |
+
)
|
| 648 |
return df_files, df_yearly
|
| 649 |
|
| 650 |
def run_adf_check(series: np.ndarray, name: str):
    """Augmented Dickey-Fuller stationarity check with up to two differencings.

    Args:
        series: 1-D numeric array (raw time series).
        name: label for the series (kept for interface compatibility; unused here).

    Returns:
        (possibly-differenced series, human-readable status string,
         bool: True when at least one differencing was applied).
    """
    def adf_p(s):
        # p-value of the ADF test; 1.0 ("non-stationary") on any failure.
        # BUG FIX: `adfuller` was never imported at module level, so the old
        # code always raised NameError here, which the bare except silently
        # converted to p=1.0 — every series was reported non-stationary.
        # Import locally, matching the local-import style of run_granger_test.
        try:
            from statsmodels.tsa.stattools import adfuller
            return adfuller(s, autolag='AIC')[1]
        except Exception:
            return 1.0

    s = series.copy()
    p0 = adf_p(s)
    if p0 <= 0.05:
        return s, f"โ Stationary at level (p={p0:.4f})", False
    # First difference.
    s1 = np.diff(s)
    p1 = adf_p(s1)
    if p1 <= 0.05:
        return s1, f"โ ๏ธ Non-stationary (p={p0:.4f}) โ 1st diff โ โ stationary (p={p1:.4f})", True
    # Second difference — returned even if still non-stationary (best effort).
    s2 = np.diff(s1)
    p2 = adf_p(s2)
    return (
        s2,
        f"โ ๏ธ Non-stationary (p={p0:.4f}) โ 1st diff (p={p1:.4f}) โ 2nd diff โ "
        f"{'โ stationary' if p2<=0.05 else 'โ ๏ธ non-stationary'} (p={p2:.4f})",
        True,
    )
|
| 671 |
|
| 672 |
def run_granger_test(series_y, series_exog, maxlag=4):
    """Granger-causality test: does *series_exog* (sentiment) help predict *series_y*?

    Both series are first made stationary via run_adf_check (differencing as
    needed), trimmed to a common length, then fed to statsmodels'
    grangercausalitytests for lags 1..maxlag.

    Returns:
        (markdown report, bool): bool is True when any lag is significant at p < 0.05.
    """
    try:
        from statsmodels.tsa.stattools import grangercausalitytests
        # Too few observations make the lag regressions meaningless.
        if len(series_y) < 10:
            return "โ ๏ธ **Granger Test skipped** โ need โฅ 10 points.", False
        # Stationarity pre-check; [:2] keeps (transformed series, status text).
        sy, status_y = run_adf_check(series_y.copy(), "Target")[:2]
        sexog, status_exog = run_adf_check(series_exog.copy(), "Sentiment")[:2]
        # Differencing may shorten a series; align both to the same tail length.
        min_len = min(len(sy), len(sexog))
        sy, sexog = sy[-min_len:], sexog[-min_len:]
        # Cap the lag order so the regressions stay identifiable on short samples.
        maxlag = min(maxlag, max(1, (len(sy) - 1) // 3))
        if len(sy) < 5:
            return "โ ๏ธ **Granger Test skipped** โ too few obs after differencing.", False
        gc_result = grangercausalitytests(
            np.column_stack([sy, sexog]), maxlag=maxlag, verbose=False
        )
        rows, any_pass, best_p = [], False, 1.0
        for lag, res in gc_result.items():
            # res[0] is the stats dict; ssr_ftest is (F-stat, p-value, df_denom, df_num).
            p_val = res[0]["ssr_ftest"][1]
            f_val = res[0]["ssr_ftest"][0]
            if p_val < 0.05: sig = "โ Yes"; any_pass = True
            elif p_val < 0.10: sig = "๐ถ Marginal"
            else: sig = "โ No"
            best_p = min(best_p, p_val)
            rows.append(f"| {lag} | {f_val:.4f} | {p_val:.4f} | {sig} |")

        # Markdown report: ADF pre-check table followed by per-lag Granger results.
        table = (
            "### ๐ฌ Granger Causality Test\n"
            "*Hโ: Sentiment does NOT Granger-cause Target*\n\n"
            f"#### ๐ ADF Stationarity Pre-check\n\n"
            f"| Series | ADF Result |\n|---|---|\n"
            f"| ๐ฏ Target | {status_y} |\n"
            f"| ๐ Sentiment | {status_exog} |\n\n"
            "#### ๐ Granger Results\n\n"
            "| Lag | F-stat | p-value | Significant? |\n|-----|--------|---------|-------------|\n"
            + "\n".join(rows)
        )
        if any_pass:
            verdict = f"\n\nโ **PASS** โ Sentiment significantly Granger-causes the target (p < 0.05)."
        elif best_p < 0.10:
            verdict = f"\n\n๐ถ **MARGINAL** โ best p = {best_p:.4f} (< 0.10)."
        else:
            verdict = "\n\nโ **FAIL** โ No significant Granger causality (p โฅ 0.05)."
        return table + verdict, any_pass
    except Exception as e:
        # Any failure (import, numerical, shape) degrades to a soft warning.
        return f"โ ๏ธ Granger test error: `{e}`\n", False
|
| 717 |
|
| 718 |
def run_dm_test(actual, pred_arima, pred_sarimax):
|
| 719 |
try:
|
| 720 |
n = len(actual)
|
| 721 |
+
if n < 3:
|
| 722 |
+
return "โ ๏ธ **DM Test skipped** โ n < 3.", False
|
| 723 |
+
d = (actual - pred_arima)**2 - (actual - pred_sarimax)**2
|
| 724 |
+
d_mean = np.mean(d)
|
| 725 |
+
d_std = np.std(d, ddof=1)
|
| 726 |
+
if d_std < 1e-10:
|
| 727 |
+
return "โ ๏ธ **DM Test** โ models identical.", False
|
| 728 |
dm_stat = d_mean / (d_std / np.sqrt(n))
|
| 729 |
p_val = 2 * (1 - stats.t.cdf(abs(dm_stat), df=n - 1))
|
| 730 |
sig = "โ
Yes" if p_val < 0.05 else ("๐ถ Marginal" if p_val < 0.10 else "โ No")
|
|
|
|
| 732 |
|
| 733 |
table = (
|
| 734 |
"### ๐ฏ Diebold-Mariano Test\n"
|
| 735 |
+
"*Hโ: Equal predictive accuracy | Hโ: SARIMAX better than ARIMA*\n\n"
|
| 736 |
"| DM Statistic | p-value | n (test) | Significant? | Better Model |\n"
|
| 737 |
"|-------------|---------|----------|-------------|-------------|\n"
|
| 738 |
f"| `{dm_stat:.4f}` | `{p_val:.4f}` | `{n}` | {sig} | **{better}** |\n"
|
| 739 |
)
|
| 740 |
passed = p_val < 0.05 and dm_stat > 0
|
| 741 |
+
if passed:
|
| 742 |
+
verdict = "\nโ
**PASS** โ SARIMAX+Ensemble is **significantly better** (p < 0.05)."
|
| 743 |
+
elif (p_val < 0.10) and dm_stat > 0:
|
| 744 |
+
verdict = f"\n๐ถ **MARGINAL** โ p = {p_val:.4f} (< 0.10)."
|
| 745 |
+
else:
|
| 746 |
+
verdict = (
|
| 747 |
+
f"\nโ **FAIL** โ Not statistically significant (p = {p_val:.4f}).\n\n"
|
| 748 |
+
f"> ๐ก With n = {n} test points, power is limited. "
|
| 749 |
+
f"Expand Start Year to 1990 for more test data."
|
| 750 |
+
)
|
| 751 |
return table + verdict, passed
|
| 752 |
+
except Exception as e:
|
| 753 |
+
return f"โ ๏ธ DM error: `{e}`\n", False
|
| 754 |
|
| 755 |
+
# ============================================================
|
| 756 |
+
# MAIN FORECAST FUNCTION โ n = 3
|
| 757 |
+
# ============================================================
|
| 758 |
def run_economic_forecast(country_code, target_var, start_year, end_year):
|
| 759 |
try:
|
| 760 |
from statsmodels.tsa.arima.model import ARIMA
|
| 761 |
from statsmodels.tsa.statespace.sarimax import SARIMAX
|
| 762 |
from sklearn.metrics import mean_squared_error, mean_absolute_error
|
| 763 |
+
except ImportError:
|
| 764 |
+
return "โ pip install statsmodels scikit-learn", None
|
| 765 |
|
| 766 |
indicator_map = {
|
| 767 |
"Inflation (CPI %)" : "FP.CPI.TOTL.ZG",
|
|
|
|
| 769 |
"Unemployment (%) ": "SL.UEM.TOTL.ZS",
|
| 770 |
"Exchange Rate" : "PA.NUS.FCRF",
|
| 771 |
}
|
| 772 |
+
econ_df = get_worldbank_data(
|
| 773 |
+
country_code,
|
| 774 |
+
indicator_map.get(target_var, "FP.CPI.TOTL.ZG"),
|
| 775 |
+
int(start_year), int(end_year),
|
| 776 |
+
)
|
| 777 |
+
if econ_df.empty:
|
| 778 |
+
return f"โ No data for **{country_code}** / **{target_var}**", None
|
| 779 |
+
if len(econ_df) < 5:
|
| 780 |
+
return f"โ ๏ธ Only **{len(econ_df)}** data points. Widen year range.", None
|
| 781 |
|
| 782 |
df_files, df_yearly = build_doc_sentiment_index()
|
| 783 |
|
| 784 |
if df_yearly is not None and len(df_yearly) >= 2:
|
| 785 |
+
merged = econ_df.merge(df_yearly, on="year", how="left")
|
| 786 |
+
merged["sentiment"] = merged["sentiment"].fillna(
|
| 787 |
+
float(df_yearly["sentiment"].mean())
|
| 788 |
+
)
|
| 789 |
+
has_yearly = True
|
| 790 |
+
mode_msg = "โ
**Yearly Ensemble Sentiment**"
|
| 791 |
else:
|
| 792 |
+
global_sent = (
|
| 793 |
+
float(pd.to_numeric(df_files["sentiment"], errors="coerce").mean())
|
| 794 |
+
if df_files is not None and len(df_files) > 0 else 0.0
|
| 795 |
+
)
|
| 796 |
merged = econ_df.copy()
|
| 797 |
merged["sentiment"] = global_sent
|
| 798 |
+
has_yearly = False
|
| 799 |
+
mode_msg = "โ ๏ธ **Global Sentiment**"
|
| 800 |
|
| 801 |
if merged["sentiment"].std() > 1e-6:
|
| 802 |
scaler = MinMaxScaler(feature_range=(-0.3, 0.3))
|
| 803 |
+
merged["sentiment"] = scaler.fit_transform(
|
| 804 |
+
merged["sentiment"].values.reshape(-1, 1)
|
| 805 |
+
).flatten().round(4)
|
| 806 |
|
| 807 |
series = merged["value"].values.astype(float)
|
| 808 |
exog = merged["sentiment"].values.reshape(-1, 1)
|
| 809 |
years = merged["year"].values
|
| 810 |
n = len(series)
|
| 811 |
|
| 812 |
+
# ============================================================
|
| 813 |
+
# โ
n = 3 โ Test on last 3 years
|
| 814 |
+
# ============================================================
|
| 815 |
split = n - 3
|
| 816 |
+
if split < 5:
|
| 817 |
+
split = max(int(n * 0.75), 5) # safety fallback for very short series
|
| 818 |
|
| 819 |
+
train_y, test_y = series[:split], series[split:]
|
| 820 |
train_exog, test_exog = exog[:split], exog[split:]
|
| 821 |
test_years = years[split:]
|
| 822 |
|
| 823 |
+
# ARIMA baseline
|
| 824 |
try:
|
| 825 |
+
m1 = ARIMA(train_y, order=(1,1,1)).fit()
|
| 826 |
pred_arima = m1.forecast(len(test_y))
|
| 827 |
rmse_a = float(np.sqrt(mean_squared_error(test_y, pred_arima)))
|
| 828 |
mae_a = float(mean_absolute_error(test_y, pred_arima))
|
| 829 |
mape_a = float(np.mean(np.abs((test_y-pred_arima)/np.maximum(np.abs(test_y),1e-8)))*100)
|
| 830 |
+
except Exception as e:
|
| 831 |
+
return f"โ ARIMA error: {e}", None
|
| 832 |
|
| 833 |
+
# SARIMAX + Ensemble Sentiment
|
| 834 |
try:
|
| 835 |
+
m2 = SARIMAX(train_y, exog=train_exog, order=(1,1,1)).fit(disp=False)
|
| 836 |
pred_sarimax = m2.forecast(len(test_y), exog=test_exog)
|
| 837 |
rmse_s = float(np.sqrt(mean_squared_error(test_y, pred_sarimax)))
|
| 838 |
mae_s = float(mean_absolute_error(test_y, pred_sarimax))
|
| 839 |
mape_s = float(np.mean(np.abs((test_y-pred_sarimax)/np.maximum(np.abs(test_y),1e-8)))*100)
|
| 840 |
+
except Exception as e:
|
| 841 |
+
return f"โ SARIMAX error: {e}", None
|
| 842 |
|
| 843 |
impr_rmse = (rmse_a - rmse_s) / rmse_a * 100
|
| 844 |
impr_mae = (mae_a - mae_s) / mae_a * 100
|
| 845 |
impr_mape = (mape_a - mape_s) / mape_a * 100
|
| 846 |
|
| 847 |
+
# Granger โ use full series
|
| 848 |
if has_yearly and df_yearly is not None and len(df_yearly) >= 5:
|
| 849 |
real_merged = econ_df.merge(df_yearly, on="year", how="inner")
|
| 850 |
+
gc_y = real_merged["value"].values.astype(float)
|
| 851 |
+
gc_exog = real_merged["sentiment"].values.astype(float)
|
| 852 |
else:
|
| 853 |
+
gc_y = series
|
| 854 |
+
gc_exog = merged["sentiment"].values
|
| 855 |
|
| 856 |
granger_md, granger_pass = run_granger_test(gc_y, gc_exog, maxlag=4)
|
| 857 |
dm_md, dm_pass = run_dm_test(test_y, np.array(pred_arima), np.array(pred_sarimax))
|
| 858 |
|
| 859 |
+
# ============================================================
|
| 860 |
+
# PLOTS
|
| 861 |
+
# ============================================================
|
| 862 |
fig, axes = plt.subplots(4, 1, figsize=(11, 18))
|
| 863 |
+
|
| 864 |
+
# Plot 1 โ Forecast
|
| 865 |
axes[0].plot(years, series, "o-", color="#2196F3", label="Actual", lw=2, ms=5)
|
| 866 |
+
axes[0].plot(test_years, pred_arima, "s--", color="#FF5722", label="ARIMA(1,1,1)", lw=2)
|
| 867 |
axes[0].plot(test_years, pred_sarimax, "^-.", color="#4CAF50", label="SARIMAX+Ensemble", lw=2)
|
| 868 |
axes[0].axvline(x=years[split-1], color="gray", linestyle=":", alpha=0.7, label="TrainโTest")
|
| 869 |
+
axes[0].set_title(
|
| 870 |
+
f"๐ {target_var} โ {country_code} (Yearly Ensemble Sentiment) | n_test={len(test_y)}",
|
| 871 |
+
fontsize=11, fontweight="bold",
|
| 872 |
+
)
|
| 873 |
+
axes[0].set_xlabel("Year"); axes[0].set_ylabel(target_var)
|
| 874 |
axes[0].legend(fontsize=9); axes[0].grid(True, alpha=0.3)
|
| 875 |
|
| 876 |
+
# Plot 2 โ Sentiment Index
|
| 877 |
+
s_clrs = [
|
| 878 |
+
"#4CAF50" if s > 0.05 else "#FF5722" if s < -0.05 else "#FFC107"
|
| 879 |
+
for s in merged["sentiment"]
|
| 880 |
+
]
|
| 881 |
axes[1].bar(years, merged["sentiment"], color=s_clrs, edgecolor="white", width=0.6)
|
| 882 |
axes[1].axhline(y=0, color="black", lw=0.8)
|
| 883 |
+
legend_patches = [
|
| 884 |
+
Patch(color="#4CAF50", label="Optimistic (>0.05)"),
|
| 885 |
+
Patch(color="#FFC107", label="Neutral"),
|
| 886 |
+
Patch(color="#FF5722", label="Pessimistic (<-0.05)"),
|
| 887 |
+
]
|
| 888 |
+
axes[1].legend(handles=legend_patches, fontsize=8, loc="upper right")
|
| 889 |
+
axes[1].set_title(
|
| 890 |
+
"๐ Ensemble Sentiment Index (FinBERT 40% + XLM 30% + Lexicon 30%)\n"
|
| 891 |
+
"per-year โ normalized [-0.3, +0.3]",
|
| 892 |
+
fontsize=10, fontweight="bold",
|
| 893 |
+
)
|
| 894 |
+
axes[1].set_xlabel("Year"); axes[1].set_ylabel("Sentiment Score")
|
| 895 |
axes[1].grid(True, alpha=0.3, axis="y")
|
| 896 |
|
| 897 |
+
# Plot 3 โ RMSE Bar
|
| 898 |
+
better_color_a = "#4CAF50" if rmse_a <= rmse_s else "#4CAF50"
|
| 899 |
+
better_color_s = "#4CAF50" if rmse_s <= rmse_a else "#4CAF50"
|
| 900 |
+
bar_colors = ["#FF5722" if rmse_a > rmse_s else "#4CAF50",
|
| 901 |
+
"#4CAF50" if rmse_s <= rmse_a else "#FF5722"]
|
| 902 |
+
bars = axes[2].bar(
|
| 903 |
+
["ARIMA(1,1,1)", "SARIMAX\n+Ensemble"],
|
| 904 |
+
[rmse_a, rmse_s], color=bar_colors, width=0.4, edgecolor="white",
|
| 905 |
+
)
|
| 906 |
for bar, val in zip(bars, [rmse_a, rmse_s]):
|
| 907 |
+
axes[2].text(
|
| 908 |
+
bar.get_x()+bar.get_width()/2, bar.get_height()+0.01,
|
| 909 |
+
f"{val:.4f}", ha="center", va="bottom", fontweight="bold", fontsize=11,
|
| 910 |
+
)
|
| 911 |
+
axes[2].set_title("๐ RMSE Comparison (lower = better)", fontsize=11)
|
| 912 |
+
axes[2].set_ylabel("RMSE"); axes[2].grid(True, alpha=0.3, axis="y")
|
| 913 |
|
| 914 |
+
# Plot 4 โ Statistical Tests Summary Table
|
| 915 |
axes[3].axis("off")
|
| 916 |
test_data = [
|
| 917 |
["Test", "Result", "Interpretation"],
|
| 918 |
+
[
|
| 919 |
+
"Granger (ADF + Granger)",
|
| 920 |
+
"โ
PASS" if granger_pass else "โ FAIL",
|
| 921 |
+
"Sentiment Granger-causes Target" if granger_pass else "No causal link detected",
|
| 922 |
+
],
|
| 923 |
+
[
|
| 924 |
+
"Diebold-Mariano\n(SARIMAX vs ARIMA)",
|
| 925 |
+
"โ
PASS" if dm_pass else "โ FAIL",
|
| 926 |
+
"SARIMAX significantly better" if dm_pass else f"n_test={len(test_y)} โ limited power",
|
| 927 |
+
],
|
| 928 |
]
|
| 929 |
+
tbl4 = axes[3].table(
|
| 930 |
+
cellText=test_data[1:], colLabels=test_data[0],
|
| 931 |
+
cellLoc="center", loc="center", colWidths=[0.35, 0.2, 0.45],
|
| 932 |
+
)
|
| 933 |
tbl4.auto_set_font_size(False); tbl4.set_fontsize(11); tbl4.scale(1, 2.5)
|
| 934 |
for (row, col), cell in tbl4.get_celld().items():
|
| 935 |
+
if row == 0:
|
| 936 |
+
cell.set_facecolor("#1565C0")
|
| 937 |
+
cell.set_text_props(color="white", fontweight="bold")
|
| 938 |
+
elif row == 1:
|
| 939 |
+
cell.set_facecolor("#E8F5E9" if granger_pass else "#FFEBEE")
|
| 940 |
+
elif row == 2:
|
| 941 |
+
cell.set_facecolor("#E8F5E9" if dm_pass else "#FFEBEE")
|
| 942 |
+
axes[3].set_title(
|
| 943 |
+
"๐ฌ Statistical Tests: ADF + Granger + DM",
|
| 944 |
+
fontsize=12, fontweight="bold", pad=20,
|
| 945 |
+
)
|
| 946 |
|
| 947 |
plt.tight_layout(pad=3.0)
|
| 948 |
img_path = "/tmp/forecast_plot.png"
|
| 949 |
+
plt.savefig(img_path, dpi=130, bbox_inches="tight")
|
| 950 |
+
plt.close(fig)
|
| 951 |
|
| 952 |
+
# ============================================================
|
| 953 |
+
# RESULT TEXT
|
| 954 |
+
# ============================================================
|
| 955 |
sent_table = ""
|
| 956 |
if df_files is not None and len(df_files) > 0:
|
| 957 |
+
sent_table = (
|
| 958 |
+
"\n---\n### ๐ Ensemble Sentiment per File\n"
|
| 959 |
+
"| ๐ File | ๐
Year | ๐ Score | ๐ฆ Chunks | Label |\n|---|---|---|---|---|\n"
|
| 960 |
+
)
|
| 961 |
+
for _, row in df_files.iterrows():
|
| 962 |
+
sent_table += (
|
| 963 |
+
f"| `{row['file']}` | {row['year']} | "
|
| 964 |
+
f"`{row['sentiment']:+.4f}` | {row['n_chunks']} | {row['label']} |\n"
|
| 965 |
+
)
|
| 966 |
|
| 967 |
result_md = (
|
| 968 |
+
f"## ๐ Forecast โ {country_code} / {target_var}\n\n"
|
| 969 |
+
f"| | |\n|---|---|\n"
|
| 970 |
+
f"| ๐ฏ Target Variable | **{target_var}** |\n"
|
| 971 |
+
f"| ๐ Sentiment Mode | {mode_msg} |\n"
|
| 972 |
+
f"| ๐ Train samples | **{split}** |\n"
|
| 973 |
+
f"| ๐งช Test samples (n)| **{len(test_y)}** |\n\n"
|
| 974 |
+
f"---\n### ๐ Model Comparison\n"
|
| 975 |
+
f"| Model | RMSE | MAE | MAPE |\n|---|---|---|---|\n"
|
| 976 |
+
f"| ARIMA(1,1,1) | `{rmse_a:.4f}` | `{mae_a:.4f}` | `{mape_a:.1f}%` |\n"
|
| 977 |
+
f"| SARIMAX+Ensemble | `{rmse_s:.4f}` | `{mae_s:.4f}` | `{mape_s:.1f}%` |\n"
|
| 978 |
+
f"| **Improvement** | **{impr_rmse:+.1f}%** | **{impr_mae:+.1f}%** | **{impr_mape:+.1f}%** |\n\n"
|
| 979 |
+
f"{'โ
**Improved** by adding Ensemble Sentiment Index.' if impr_rmse > 0 else 'โ ๏ธ No RMSE improvement for this variable.'}\n\n"
|
| 980 |
f"---\n{granger_md}\n\n---\n{dm_md}\n{sent_table}"
|
| 981 |
)
|
| 982 |
return result_md, img_path
|
| 983 |
|
| 984 |
# ============================================================
|
| 985 |
+
# UTILITIES
|
| 986 |
# ============================================================
|
| 987 |
def generate_report(text, sent, conf, md):
    """Write a small Markdown analysis report to /tmp and return its path.

    Note: `conf` is accepted for interface compatibility but is not
    written into the report body.
    """
    report_path = "/tmp/report.md"
    body = f"# Report\n\n**Input:** {text}\n**Sentiment:** {sent}\n\n{md}"
    with open(report_path, "w", encoding="utf-8") as fh:
        fh.write(body)
    return report_path
|
| 992 |
+
|
| 993 |
def export_chat(history):
    """Dump a chat history (list of {'role', 'content'} dicts) to a text file.

    Returns the path of the written transcript (/tmp/chat.txt).
    """
    out_path = "/tmp/chat.txt"
    rendered = "".join(f"{turn['role']}:\n{turn['content']}\n\n" for turn in history)
    with open(out_path, "w", encoding="utf-8") as fh:
        fh.write(rendered)
    return out_path
|
| 999 |
+
|
| 1000 |
+
def get_stats():
    """Render session statistics as a Markdown table.

    Reads the module-level CHAT_STATS counters (questions / found /
    not_found) and the size of the KB_TEXTS chunk index.
    """
    return (
        f"### ๐ Session Stats\n\n"
        f"| | |\n|---|---|\n"
        f"| โ Questions asked | **{CHAT_STATS['questions']}** |\n"
        f"| โ RAG answers | **{CHAT_STATS['found']}** |\n"
        f"| ๐ค General answers | **{CHAT_STATS['not_found']}** |\n"
        f"| ๐ฆ Chunks indexed | **{len(KB_TEXTS):,}** |\n"
    )
|
| 1009 |
+
|
| 1010 |
def get_top_keywords():
    """List the 20 most frequent words (4+ chars) across all indexed chunks."""
    if not KB_TEXTS:
        return "_No files uploaded yet._"
    # Common English function words that carry no topical signal.
    skip = {"this", "that", "with", "from", "have", "been", "were", "they",
            "their", "there", "what", "when", "which", "will", "also",
            "than", "into", "more"}
    corpus = " ".join(KB_TEXTS).lower()
    counts = Counter()
    for word in re.findall(r"\b\w{4,}\b", corpus):
        if word not in skip:
            counts[word] += 1
    bullets = "\n".join(f"- **{w}**: {c}" for w, c in counts.most_common(20))
    return "### 🔑 Top 20 Keywords\n\n" + bullets
|
| 1019 |
+
|
| 1020 |
+
def update_threshold(val):
    """Set the module-wide retrieval similarity cutoff and confirm it."""
    global MIN_SIMILARITY
    MIN_SIMILARITY = val
    return "✅ Threshold set to: {:.0%}".format(val)
|
| 1024 |
+
|
| 1025 |
def chat_text(message, history):
    """Handle one text chat turn.

    Returns ("", new_history): the empty string clears the input textbox,
    and the history gains the user turn plus the assistant's answer.
    Blank/whitespace-only messages are ignored.
    """
    if not message.strip():
        return "", history
    answer, _ = smart_answer(message, history)
    user_turn = {"role": "user", "content": message}
    bot_turn = {"role": "assistant", "content": answer}
    return "", history + [user_turn, bot_turn]
|
| 1032 |
+
|
| 1033 |
def tts_save(text, lang="en"):
    """Synthesize speech for `text` with gTTS and return the mp3 path.

    Markdown punctuation is stripped and input is capped at 600 characters
    to keep synthesis fast; any language other than Arabic falls back to
    English.
    """
    path = "/tmp/ans.mp3"
    clean = re.sub(r"[*`#>\[\]|_]", "", text)
    voice_lang = "ar" if lang == "ar" else "en"
    tts = gTTS(text=clean[:600], lang=voice_lang)
    tts.save(path)
    return path
|
| 1040 |
+
|
| 1041 |
def chat_voice(audio, history):
    """Voice chat turn: transcribe the recording, answer, and speak the reply.

    Args:
        audio:   (sample_rate, samples) tuple from gr.Audio(type="numpy"),
                 or None when nothing was recorded.
        history: current chat history (list of role/content dicts).

    Returns:
        (updated_history, mp3_path, transcript).

    Raises:
        gr.Error: when no audio was received.
    """
    if audio is None:
        raise gr.Error("No audio received.")
    sr, samples = audio
    if isinstance(samples, list):
        samples = np.array(samples)
    # Downmix multi-channel audio to mono for the ASR model.
    if samples.ndim > 1:
        samples = samples.mean(axis=1)
    asr_input = {"array": samples.astype(np.float32), "sampling_rate": sr}
    transcript = asr(asr_input)["text"]
    lang = detect_lang(transcript)
    answer, _ = smart_answer(transcript, history)
    new_history = history + [
        {"role": "user", "content": f"🎙️ {transcript}"},
        {"role": "assistant", "content": answer},
    ]
    return new_history, tts_save(answer, lang), transcript
|
| 1054 |
+
|
| 1055 |
+
# ============================================================
# GRADIO UI
# ============================================================
# Declarative layout: component creation order determines on-screen order,
# and event wiring references the handler functions defined above.
with gr.Blocks(title="RAG + Sentiment + Forecast", theme=gr.themes.Soft()) as app:
    gr.Markdown(
        "# 🤖 Hybrid Multilingual RAG + Ensemble Sentiment + Economic Forecast\n"
        "**ENSSEA — Master's Thesis | Si Tayeb Houari | 2025–2026**"
    )

    # ── Tab 1: Upload ─────────────────────────────────────────
    # Document ingestion: build the FAISS index, persist/restore it,
    # and tune the retrieval similarity threshold.
    with gr.Tab("📂 1 · Upload"):
        files = gr.File(
            label="📎 Upload Files (PDF / TXT / CSV / DOCX)",
            file_types=[".pdf", ".txt", ".csv", ".docx"],
            file_count="multiple", type="filepath",
        )
        build_btn = gr.Button("🔨 Build Index", variant="primary")
        status = gr.Markdown("_No index built yet._")
        with gr.Row():
            save_btn = gr.Button("💾 Save Index")
            load_btn = gr.Button("📂 Load Saved Index")
        persist_status = gr.Markdown()
        sim_slider = gr.Slider(0.0, 1.0, value=0.10, step=0.05, label="🎯 Similarity Threshold")
        threshold_status = gr.Markdown()
        build_btn.click(build_index, inputs=files, outputs=status)
        save_btn.click(save_index, outputs=persist_status)
        load_btn.click(load_saved_index, outputs=persist_status)
        sim_slider.change(update_threshold, inputs=sim_slider, outputs=threshold_status)

    # ── Tab 2: Sentiment & Word Search ────────────────────────
    with gr.Tab("🎭 2 · Sentiment & Search"):
        inp = gr.Textbox(lines=3, label="📝 Enter text or keyword")
        run_btn = gr.Button("🔍 Analyze & Search", variant="primary")
        with gr.Row():
            out_sent = gr.Textbox(label="🎭 Sentiment")
            out_conf = gr.Number(label="📊 Score")
        out_full = gr.Markdown()
        rep_btn = gr.Button("📄 Download Report")
        rep_file = gr.File(label="📥 Report")
        run_btn.click(predict_with_rag, inputs=inp, outputs=[out_sent, out_conf, out_full])
        rep_btn.click(generate_report, inputs=[inp, out_sent, out_conf, out_full], outputs=rep_file)

    # ── Tab 3: Smart Chatbot ──────────────────────────────────
    with gr.Tab("💬 3 · Smart Chatbot"):
        chatbot = gr.Chatbot(height=430, type="messages", show_label=False)
        msg = gr.Textbox(placeholder="Ask anything about your documents…", label="💬 Message")
        with gr.Row():
            send_btn = gr.Button("📨 Send", variant="primary")
            clear_btn = gr.Button("🗑️ Clear")
            exp_btn = gr.Button("📥 Export")
        exp_file = gr.File(label="💾 Chat Export")
        # Enter key and Send button are wired to the same handler.
        msg.submit(chat_text, inputs=[msg, chatbot], outputs=[msg, chatbot])
        send_btn.click(chat_text, inputs=[msg, chatbot], outputs=[msg, chatbot])
        clear_btn.click(lambda: ([], ""), outputs=[chatbot, msg])
        exp_btn.click(export_chat, inputs=chatbot, outputs=exp_file)

    # ── Tab 4: Voice ──────────────────────────────────────────
    with gr.Tab("🎙️ 4 · Voice"):
        gr.Markdown("### 🎙️ Speak your question — get a spoken answer")
        voice_input = gr.Audio(sources=["microphone"], type="numpy", label="🎤 Record")
        voice_btn = gr.Button("🎙️ Ask by Voice", variant="primary")
        voice_chat = gr.Chatbot(height=300, type="messages")
        audio_output = gr.Audio(label="🔊 Answer", autoplay=True)
        transcript_out = gr.Textbox(label="📝 Transcript")
        voice_btn.click(chat_voice, inputs=[voice_input, voice_chat],
                        outputs=[voice_chat, audio_output, transcript_out])

    # ── Tab 5: Analytics ──────────────────────────────────────
    with gr.Tab("📊 5 · Analytics"):
        stats_btn = gr.Button("🔄 Refresh Stats")
        stats_out = gr.Markdown()
        kw_btn = gr.Button("🔑 Top Keywords")
        kw_out = gr.Markdown()
        stats_btn.click(get_stats, outputs=stats_out)
        kw_btn.click(get_top_keywords, outputs=kw_out)

    # ── Tab 6: About ──────────────────────────────────────────
    with gr.Tab("ℹ️ 6 · About"):
        gr.Markdown(
            "## 🤖 Hybrid Multilingual RAG Framework\n\n"
            "| Component | Details |\n|---|---|\n"
            "| 🏫 School | ENSSEA — École Nationale Supérieure de Statistique et d'Économie Appliquée |\n"
            "| 👤 Author | Si Tayeb Houari |\n"
            "| 📅 Year | 2025–2026 |\n"
            "| 🎓 Degree | Master's — Statistics & Foresight Economics |\n\n"
            "### 🧠 Models Used\n"
            "- 🏦 **FinBERT** (ProsusAI) — Financial sentiment (40%)\n"
            "- 🌍 **XLM-RoBERTa** (CardiffNLP) — Multilingual sentiment (30%)\n"
            "- 📖 **Economic Lexicon** — Domain-specific keywords (30%)\n"
            "- 🔎 **MiniLM-L12** — Multilingual embeddings (FAISS)\n"
            "- 🎯 **ms-marco-MiniLM** — Cross-encoder reranking\n"
            "- 🗣️ **Whisper-small** — ASR\n"
            "- 🤖 **Llama-3.3-70B** via Groq — Response generation\n\n"
            "### 📈 Forecasting\n"
            "- Baseline: **ARIMA(1,1,1)**\n"
            "- Enhanced: **SARIMAX + Ensemble Sentiment** (n_test = 3)\n"
            "- Tests: **ADF**, **Granger Causality**, **Diebold-Mariano**\n"
            "- Data: **World Bank API**\n"
        )

    # ── Tab 7: Economic Forecast ──────────────────────────────
    with gr.Tab("📈 7 · Forecast"):
        gr.Markdown(
            "## 📈 Economic Forecast — ARIMA vs SARIMAX + Ensemble Sentiment\n"
            "> **n_test = 3** — Evaluates on the last 3 years (captures recent economic turbulence)"
        )
        with gr.Row():
            country_input = gr.Textbox(
                value="DZ", label="🌍 Country Code (ISO)",
                placeholder="e.g. DZ, MA, TN, EG, US",
            )
            target_input = gr.Dropdown(
                choices=[
                    "Inflation (CPI %)",
                    # NOTE(review): trailing spaces in the next two labels look
                    # accidental — confirm run_economic_forecast matches them
                    # exactly before normalizing.
                    "GDP Growth (%) ",
                    "Unemployment (%) ",
                    "Exchange Rate",
                ],
                value="Inflation (CPI %)",
                label="🎯 Target Variable",
            )
        with gr.Row():
            start_year = gr.Slider(
                minimum=1990, maximum=2020, value=2000, step=1, label="📅 Start Year"
            )
            end_year = gr.Slider(
                minimum=2010, maximum=2024, value=2023, step=1, label="📅 End Year"
            )
        forecast_btn = gr.Button("🚀 Run Forecast", variant="primary", size="lg")
        forecast_result = gr.Markdown()
        forecast_plot = gr.Image(label="📊 Forecast Chart", type="filepath")
        forecast_btn.click(
            run_economic_forecast,
            inputs=[country_input, target_input, start_year, end_year],
            outputs=[forecast_result, forecast_plot],
        )

# Bind on all interfaces at the Spaces default port; hide the REST API page.
app.launch(server_name="0.0.0.0", server_port=7860, show_api=False)
|