Desalegnn committed on
Commit 1ca96e2 · verified · 1 Parent(s): edfe0a6

Update app.py

Files changed (1):
  1. app.py +273 -61
app.py CHANGED
@@ -1,70 +1,282 @@
 import gradio as gr
-from huggingface_hub import InferenceClient
-
-
-def respond(
-    message,
-    history: list[dict[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-    hf_token: gr.OAuthToken,
-):
     """
-    For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
     """
-    client = InferenceClient(token=hf_token.token, model="openai/gpt-oss-20b")

-    messages = [{"role": "system", "content": system_message}]

-    messages.extend(history)

-    messages.append({"role": "user", "content": message})

-    response = ""

-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
     ):
-        choices = message.choices
-        token = ""
-        if len(choices) and choices[0].delta.content:
-            token = choices[0].delta.content
-
-        response += token
-        yield response
-
-
-"""
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
-"""
-chatbot = gr.ChatInterface(
-    respond,
-    type="messages",
-    additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
-        ),
-    ],
-)
-
-with gr.Blocks() as demo:
-    with gr.Sidebar():
-        gr.LoginButton()
-    chatbot.render()
-
-
-if __name__ == "__main__":
-    demo.launch()
+import time
+import json
+import numpy as np
+import faiss
+import torch
 import gradio as gr
+
+from transformers import AutoTokenizer, AutoModel, AutoModelForQuestionAnswering
+
+
+# -------------------------------------------------------
+# CONFIG
+# -------------------------------------------------------
+
+# Embedding model for retrieval
+EMBED_MODEL = "Desalegnn/Desu-snowflake-arctic-embed-l-v2.0-finetuned-amharic-45k"
+
+# Extractive QA model (generator/reader)
+QA_MODEL = "Desalegnn/afroxlmr-amharic-qa"
+
+# Local files in the Space repo (⚠️ make sure names match what you upload)
+FAISS_PATH = "amharic_faiss.bin"      # upload this file
+METADATA_PATH = "passage_meta.jsonl"  # upload this file
+
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+print("DEVICE:", DEVICE)
+
+
+# -------------------------------------------------------
+# LOAD MODELS + INDEX + METADATA
+# -------------------------------------------------------
+
+# 1) Embedding model
+embed_tokenizer = AutoTokenizer.from_pretrained(EMBED_MODEL)
+embed_model = AutoModel.from_pretrained(EMBED_MODEL).to(DEVICE)
+embed_model.eval()
+
+# 2) QA model
+qa_tokenizer = AutoTokenizer.from_pretrained(QA_MODEL)
+qa_model = AutoModelForQuestionAnswering.from_pretrained(QA_MODEL).to(DEVICE)
+qa_model.eval()
+
+# 3) FAISS index
+index = faiss.read_index(FAISS_PATH)
+print("FAISS dimension:", index.d)
+
+# 4) Passage metadata
+metadata = []
+with open(METADATA_PATH, "r", encoding="utf-8") as f:
+    for line in f:
+        line = line.strip()
+        if line:
+            metadata.append(json.loads(line))
+
+print("Loaded passages:", len(metadata))
+
+
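For reference, the loader above expects `passage_meta.jsonl` to hold one JSON object per line, and the retrieval code further down reads only the `id` and `text` fields. A minimal sketch of writing one record (the text value is a placeholder):

```python
import json

# One passage per line; "id" and "text" are the only fields the app
# reads later via meta.get(...). The text here is a placeholder.
record = {"id": 0, "text": "αŠ α‰£α‹­ α‹ˆαŠ•α‹ ..."}
with open("passage_meta.jsonl", "a", encoding="utf-8") as f:
    f.write(json.dumps(record, ensure_ascii=False) + "\n")
```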
58
+# -------------------------------------------------------
+# EMBEDDING FUNCTION
+# -------------------------------------------------------
+
+@torch.no_grad()
+def embed_texts(texts, batch_size=8):
+    """
+    Embed a list of texts using the Snowflake model (mean-pooled).
+    Returns np.ndarray of shape [N, D].
+    """
+    all_embs = []
+
+    for i in range(0, len(texts), batch_size):
+        batch = texts[i:i + batch_size]
+
+        enc = embed_tokenizer(
+            batch,
+            padding=True,
+            truncation=True,
+            max_length=256,
+            return_tensors="pt",
+        ).to(DEVICE)
+
+        out = embed_model(**enc).last_hidden_state  # [B, T, D]
+        mask = enc["attention_mask"].unsqueeze(-1)  # [B, T, 1]
+
+        summed = (out * mask).sum(dim=1)            # [B, D]
+        counts = mask.sum(dim=1).clamp(min=1e-9)    # [B, 1]
+        emb = (summed / counts).cpu().numpy()       # [B, D]
+
+        all_embs.append(emb)
+
+    return np.vstack(all_embs).astype("float32")
+
+
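The FAISS index itself is built offline and only loaded by this Space. A plausible build sketch reusing `embed_texts` (not part of this commit; `IndexFlatL2` is an assumption, chosen because it matches the `score = -dist` convention in `retrieve_top_k` below):

```python
# Hypothetical offline build step -- not part of this commit.
texts = [m["text"] for m in metadata]          # or the raw corpus
embs = embed_texts(texts)                      # [N, D] float32, mean-pooled
index = faiss.IndexFlatL2(int(embs.shape[1]))  # assumed metric: L2
index.add(embs)
faiss.write_index(index, "amharic_faiss.bin")
```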
93
+# -------------------------------------------------------
+# RETRIEVAL
+# -------------------------------------------------------
+
+def retrieve_top_k(query, k=5):
+    """
+    1) Embed query with Snowflake.
+    2) Search FAISS index.
+    3) Return top-k passages and retrieval latency (ms).
+    """
+    t0 = time.time()
+
+    query_emb = embed_texts([query])  # [1, D]
+    distances, indices = index.search(query_emb, k)
+
+    ret_latency = (time.time() - t0) * 1000.0  # ms
+
+    distances = distances[0]
+    indices = indices[0]
+
+    results = []
+    for idx, dist in zip(indices, distances):
+        if 0 <= idx < len(metadata):
+            meta = metadata[idx]
+            results.append(
+                {
+                    "id": meta.get("id", idx),
+                    "text": meta.get("text", ""),
+                    "score": float(-dist),  # larger is better
+                }
+            )
+
+    return results, ret_latency
+
+
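Note that `score = float(-dist)` assumes an L2 index, where smaller distances are better. If the index were instead built for inner product (e.g. over normalized embeddings), the raw value would already be a similarity. A small helper covering both cases (a sketch; `faiss_score` is not in the committed file):

```python
def faiss_score(idx, dist):
    # Orient a raw FAISS value so that larger is always better.
    if idx.metric_type == faiss.METRIC_INNER_PRODUCT:
        return float(dist)   # similarity: keep as-is
    return float(-dist)      # L2 distance: negate
```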
128
+# -------------------------------------------------------
+# EXTRACTIVE QA ON ONE PASSAGE
+# -------------------------------------------------------
+
+@torch.no_grad()
+def answer_on_context(question, passage):
     """
+    Apply AfroXLM-R QA model to (question, passage) and return best span + score.
     """
+    enc = qa_tokenizer(
+        question,
+        passage,
+        truncation="only_second",
+        max_length=384,
+        padding="max_length",
+        return_offsets_mapping=True,
+        return_tensors="pt",
+    )

+    input_ids = enc["input_ids"].to(DEVICE)
+    attention_mask = enc["attention_mask"].to(DEVICE)
+    offset_mapping = enc["offset_mapping"][0].tolist()
+    sequence_ids = enc.sequence_ids(0)  # 0 = question, 1 = context, None = special

+    outputs = qa_model(input_ids=input_ids, attention_mask=attention_mask)

+    start_logits = outputs.start_logits[0].cpu().numpy()
+    end_logits = outputs.end_logits[0].cpu().numpy()

+    # mask out non-context tokens
+    for i, sid in enumerate(sequence_ids):
+        if sid != 1:
+            start_logits[i] = -1e9
+            end_logits[i] = -1e9

+    start_idx = int(np.argmax(start_logits))
+    end_idx = int(np.argmax(end_logits))
+    if end_idx < start_idx:
+        end_idx = start_idx
+
+    # convert to char positions
+    start_char, end_char = offset_mapping[start_idx][0], offset_mapping[end_idx][1]
+
+    if (
+        start_char is None
+        or end_char is None
+        or end_char <= start_char
+        or start_char < 0
+        or end_char > len(passage)
     ):
+        answer_text = ""
+    else:
+        answer_text = passage[start_char:end_char]
+
+    score = float(start_logits[start_idx] + end_logits[end_idx])
+
+    return answer_text.strip(), score
+
+
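One limitation worth noting: `truncation="only_second"` with `max_length=384` silently drops the tail of long passages, so an answer past roughly the first 384 tokens can never be extracted. A common remedy is to slide a window over the context and keep the best-scoring span; a rough sketch built on the committed `answer_on_context` (character windows stand in for token-level striding with `return_overflowing_tokens=True`, to keep the example short):

```python
def answer_on_long_context(question, passage, window=1000, stride=500):
    # Illustrative sketch, not in this commit: scan overlapping
    # character windows and keep the highest-scoring answer span.
    best_ans, best_score = "", float("-inf")
    for start in range(0, max(len(passage) - window, 0) + 1, stride):
        ans, score = answer_on_context(question, passage[start:start + window])
        if ans and score > best_score:
            best_ans, best_score = ans, score
    return best_ans, best_score
```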
187
+# -------------------------------------------------------
+# RAG PIPELINE: RETRIEVE -> EXTRACTIVE QA
+# -------------------------------------------------------
+
+def rag_pipeline(question, k=5):
+    """
+    1) Retrieve top-k passages.
+    2) Run AfroXLM-R QA on each passage.
+    3) Select best answer by score.
+    4) Return answer, retrieval latency, generator latency, passage snippet.
+    """
+    # 1) Retrieval
+    passages, ret_lat = retrieve_top_k(question, k)
+
+    if not passages:
+        return (
+            "**Answer:** αˆ˜αˆ¨αŒƒ αŠ αˆα‰°αŒˆαŠ˜αˆα’",
+            f"**Retrieval Latency:** {ret_lat:.2f} ms",
+            "**Generator Latency:** 0.00 ms",
+            "",
+        )
+
+    # 2) QA on each passage
+    t0 = time.time()
+
+    best_answer = ""
+    best_score = -1e9
+    best_passage_text = ""
+
+    for p in passages:
+        ctx = p["text"]
+        if not ctx.strip():
+            continue
+
+        ans, score = answer_on_context(question, ctx)
+        if ans and score > best_score:
+            best_score = score
+            best_answer = ans
+            best_passage_text = ctx
+
+    gen_lat = (time.time() - t0) * 1000.0  # ms
+
+    if not best_answer:
+        best_answer = "መልሡ αŠ αˆα‰°αŒˆαŠ˜αˆα’"
+
+    snippet = best_passage_text[:500] + ("..." if len(best_passage_text) > 500 else "")
+
+    return (
+        f"**Answer (AfroXLM-R extractive):** {best_answer}",
+        f"**Retrieval Latency:** {ret_lat:.2f} ms",
+        f"**Generator Latency (QA):** {gen_lat:.2f} ms",
+        snippet,
+    )
+
+
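For orientation, the two Amharic fallback strings mean "No information was found." and "The answer was not found." A quick smoke test of the pipeline from a Python shell, reusing the UI's own example question:

```python
# Illustrative smoke test; run after the models and index have loaded.
answer, ret_lat, gen_lat, snippet = rag_pipeline("αŠ α‰£α‹­ α‹ˆαŠ•α‹ የቡ αŠα‹ α‹¨αˆšαˆ˜αŠαŒ¨α‹?", k=5)
print(answer)    # "**Answer (AfroXLM-R extractive):** ..."
print(ret_lat)   # "**Retrieval Latency:** ... ms"
print(gen_lat)   # "**Generator Latency (QA):** ... ms"
```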
242
+# -------------------------------------------------------
+# GRADIO APP
+# -------------------------------------------------------
+
+def gradio_rag(query, k):
+    query = (query or "").strip()
+    if not query:
+        return "Please type a question.", "", "", ""
+    return rag_pipeline(query, int(k))
+
+
+with gr.Blocks() as app:
+    gr.Markdown("<h2>🇪🇹 Amharic RAG (Snowflake + AfroXLM-R Extractive QA)</h2>")
+    gr.Markdown(
+        "Retrieval-Augmented Question Answering: "
+        "Snowflake embeddings + FAISS for retrieval, "
+        "AfroXLM-R extractive model for answer spans."
+    )
+
+    with gr.Row():
+        query = gr.Textbox(
+            label="Ask an Amharic question",
+            lines=2,
+            placeholder="ምሳሌፑ αŠ α‰£α‹­ α‹ˆαŠ•α‹ የቡ αŠα‹ α‹¨αˆšαˆ˜αŠαŒ¨α‹?",
+        )
+        k = gr.Slider(1, 10, value=5, step=1, label="Top-K passages")
+
+    btn = gr.Button("Run RAG")
+
+    out_answer = gr.Markdown(label="Answer")
+    out_retlat = gr.Markdown(label="Retrieval latency")
+    out_genlat = gr.Markdown(label="Generator latency")
+    out_passage = gr.Textbox(label="Retrieved passage snippet", lines=10)
+
+    btn.click(
+        gradio_rag,
+        inputs=[query, k],
+        outputs=[out_answer, out_retlat, out_genlat, out_passage],
+    )
+
+app.launch()
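Finally, for the Space to build, the repo also needs a `requirements.txt` covering the new imports. A plausible minimal set, not part of this commit (versions unpinned; `faiss-cpu` assumes a CPU Space):

```text
gradio
transformers
torch
faiss-cpu
numpy
```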