Rishitha3 committed on
Commit
9bf28bf
·
verified ·
1 Parent(s): 2a6a2d5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +87 -149
app.py CHANGED
@@ -1,158 +1,96 @@
1
  import gradio as gr
2
- import fitz # PyMuPDF
3
- import re
4
- import numpy as np
5
  import faiss
6
  import os
7
  from sentence_transformers import SentenceTransformer
8
- from transformers import AutoTokenizer, AutoModelForCausalLM
9
- import torch
10
- from huggingface_hub import login
11
 
12
- # -----------------------------
13
- # PDF Text Loader
14
- # -----------------------------
15
- def load_pdf_text(file_obj):
16
- doc = fitz.open(stream=file_obj.read(), filetype="pdf")
 
17
  text = ""
18
- for page in doc:
19
- text += page.get_text()
20
- if not text.strip():
21
- raise ValueError("No text found in PDF.")
 
 
 
 
 
 
22
  return text
23
 
24
- # -----------------------------
25
- # Chunk Text
26
- # -----------------------------
27
- def chunk_text(text, max_tokens=200):
28
- sentences = re.split(r'(?<=[.!?]) +', text)
29
- chunks, current_chunk = [], []
30
- current_len = 0
31
- for sentence in sentences:
32
- word_count = len(sentence.split())
33
- if current_len + word_count > max_tokens:
34
- chunks.append(" ".join(current_chunk))
35
- current_chunk = [sentence]
36
- current_len = word_count
37
- else:
38
- current_chunk.append(sentence)
39
- current_len += word_count
40
- if current_chunk:
41
- chunks.append(" ".join(current_chunk))
42
- return chunks
43
-
44
- # -----------------------------
45
- # Simple Vector Store
46
- # -----------------------------
47
- class SimpleVectorStore:
48
- def __init__(self, dim):
49
- self.dim = dim
50
- self.vectors = []
51
- self.metadata = []
52
- self.index = None
53
-
54
- def add(self, vectors, metas):
55
- for v, m in zip(vectors, metas):
56
- vec = np.array(v, dtype=np.float32)
57
- self.vectors.append(vec)
58
- self.metadata.append(m)
59
- if self.vectors:
60
- self.index = faiss.IndexFlatL2(self.dim)
61
- self.index.add(np.stack(self.vectors))
62
-
63
- def search(self, query_vector, k=5):
64
- query_vector = np.array(query_vector, dtype=np.float32).reshape(1, -1)
65
- D, I = self.index.search(query_vector, k)
66
- results = [self.metadata[i] for i in I[0]]
67
- return results
68
-
69
- # -----------------------------
70
- # Index PDF
71
- # -----------------------------
72
- def index_pdf(file_obj):
73
- text = load_pdf_text(file_obj)
74
- chunks = chunk_text(text)
75
- embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
76
- vectors = embed_model.encode(chunks)
77
- store = SimpleVectorStore(dim=vectors.shape[1])
78
- store.add(vectors, chunks)
79
- return embed_model, store
80
-
81
- # -----------------------------
82
- # Load LLaMA Model
83
- # -----------------------------
84
- def load_llm():
85
- model_id = "meta-llama/Llama-3.2-3b-instruct"
86
- hf_token = os.getenv("HF_TOKEN")
87
- if not hf_token:
88
- raise ValueError("HF_TOKEN is not set. Please add it in Hugging Face Secrets.")
89
- login(hf_token)
90
-
91
- tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
92
- llm = AutoModelForCausalLM.from_pretrained(
93
- model_id,
94
- device_map="auto",
95
- torch_dtype=torch.float16,
96
- token=hf_token
97
- )
98
- return tokenizer, llm
99
-
100
- # -----------------------------
101
- # HyDE + Answer Query
102
- # -----------------------------
103
- def answer_query(file_obj, question):
104
- try:
105
- embed_model, store = index_pdf(file_obj)
106
- tokenizer, llm = load_llm()
107
-
108
- # ---- Step 1: HyDE hypothetical answer ----
109
- hyde_prompt = f"""
110
- [INST] Write a detailed hypothetical answer to this question:
111
- {question}
112
- Answer: [/INST]
113
- """
114
- inputs = tokenizer(hyde_prompt, return_tensors="pt").to(llm.device)
115
- hyde_out = llm.generate(**inputs, max_new_tokens=200)
116
- hypo_answer = tokenizer.decode(hyde_out[0], skip_special_tokens=True)
117
-
118
- # ---- Step 2: Embed hypothetical answer ----
119
- query_vec = embed_model.encode([hypo_answer])[0]
120
-
121
- # ---- Step 3: Retrieve top chunks ----
122
- relevant_chunks = store.search(query_vec, k=5)
123
- context = "\n".join(relevant_chunks)
124
-
125
- # ---- Step 4: Final Answer ----
126
- final_prompt = f"""
127
- [INST] You are a helpful tutor. Based only on the context below, answer the question.
128
- If context does not have the info, say "I could not find this in the text."
129
- Context:
130
- {context}
131
- Question: {question}
132
- Answer: [/INST]
133
- """
134
- inputs = tokenizer(final_prompt, return_tensors="pt", truncation=True).to(llm.device)
135
- outputs = llm.generate(**inputs, max_new_tokens=300, temperature=0.7, top_p=0.9, do_sample=True)
136
- answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
137
-
138
- if "Answer:" in answer:
139
- answer = answer.split("Answer:")[-1].strip()
140
-
141
- return answer
142
-
143
- except Exception as e:
144
- return f"⚠️ Error: {e}"
145
-
146
- # -----------------------------
147
- # Gradio UI
148
- # -----------------------------
149
  with gr.Blocks() as demo:
150
- gr.Markdown("## 📚 HyDE RAG Chatbot (PDF Tutor)")
151
- file_input = gr.File(label="Upload PDF", type="filepath")
152
- question = gr.Textbox(label="Ask a Question")
153
- answer = gr.Textbox(label="Answer", interactive=False)
154
- btn = gr.Button("Get Answer")
155
-
156
- btn.click(fn=answer_query, inputs=[file_input, question], outputs=answer)
157
-
158
- demo.launch()
 
 
 
 
 
 
 
1
  import gradio as gr
2
+ import fitz # PyMuPDF for PDFs
3
+ import docx
 
4
  import faiss
5
  import os
6
  from sentence_transformers import SentenceTransformer
7
+ from transformers import pipeline
 
 
8
 
9
# 1. Load embedding + QA model once at import time (shared by all requests).
#    Model ids can be overridden via environment variables without code changes;
#    the defaults keep the original behavior.
EMBED_MODEL_ID = os.getenv("EMBED_MODEL_ID", "sentence-transformers/all-MiniLM-L6-v2")
GEN_MODEL_ID = os.getenv("GEN_MODEL_ID", "gpt2")  # gpt2 is CPU-friendly; replace with a better model if GPU is available

embedding_model = SentenceTransformer(EMBED_MODEL_ID)
qa_model = pipeline("text-generation", model=GEN_MODEL_ID)
12
+
13
+ # 2. Helper: extract text from files
14
# 2. Helper: extract text from files
def extract_text(file):
    """Extract plain text from an uploaded PDF, DOCX, or plain-text file.

    Args:
        file: Either a filesystem path (str) — what ``gr.File(type="filepath")``
              actually delivers — or a file-like object with a ``.name`` attribute.

    Returns:
        The extracted text as a single string (may be empty).
    """
    # gr.File(type="filepath") hands us a plain str, not a file object;
    # the original `file.name` / `file.read()` calls crash in that case.
    path = file if isinstance(file, str) else file.name

    text = ""
    if path.endswith(".pdf"):
        doc = fitz.open(path)
        for page in doc:
            text += page.get_text("text")
        doc.close()  # release the underlying file handle
    elif path.endswith(".docx"):
        doc = docx.Document(path)
        for para in doc.paragraphs:
            text += para.text + "\n"
    else:  # fallback: treat anything else as UTF-8-ish text
        with open(path, "rb") as fh:
            text = fh.read().decode("utf-8", errors="ignore")
    return text
27
 
28
+ # 3. Helper: create FAISS index
29
# 3. Helper: create FAISS index
def build_faiss(text, chunk_size=500, overlap=50):
    """Split *text* into overlapping character chunks, embed them, and index them.

    Args:
        text: The full document text.
        chunk_size: Characters per chunk.
        overlap: Characters shared between consecutive chunks.

    Returns:
        (index, chunks): a ``faiss.IndexFlatL2`` over the chunk embeddings and
        the parallel list of chunk strings (row i of the index ↔ chunks[i]).

    Raises:
        ValueError: If the document contains no extractable text.
    """
    if not text.strip():
        # Embedding an empty chunk list would fail later with an opaque shape error.
        raise ValueError("Document contains no extractable text.")

    # Guard: overlap >= chunk_size would make the range step <= 0
    # (ValueError on step 0, or nonsense chunks on a negative step).
    step = max(1, chunk_size - overlap)
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), step)]

    # Embed chunks and store them in an exact L2 index.
    embeddings = embedding_model.encode(chunks, convert_to_numpy=True)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

    return index, chunks
43
+
44
+ # Global storage
45
# Module-level state for the most recently indexed document.
doc_index = None   # FAISS index over chunk embeddings; None until a file is indexed
doc_chunks = None  # list of chunk strings, parallel to the index rows
47
+
48
+ # 4. Process uploaded file
49
# 4. Process uploaded file
def upload_file(file):
    """Extract and index an uploaded document; return a status message for the UI.

    Stores the resulting FAISS index and chunk list in the module-level
    ``doc_index`` / ``doc_chunks`` so ``answer_query`` can use them.
    """
    global doc_index, doc_chunks
    if file is None:
        return "⚠️ Please choose a file to upload."
    try:
        text = extract_text(file)
        doc_index, doc_chunks = build_faiss(text)
    except Exception as e:
        # Surface the failure in the Status textbox instead of a raw stack trace,
        # matching the ⚠️-message style used by answer_query.
        return f"⚠️ Failed to index document: {e}"
    return "✅ Document indexed with HyDE! You can now ask questions."
54
+
55
+ # 5. HyDE RAG answering
56
# 5. HyDE RAG answering
def answer_query(query):
    """Answer *query* about the indexed document using HyDE retrieval.

    HyDE: generate a hypothetical answer first, embed THAT instead of the raw
    query, retrieve the closest chunks, then answer from the retrieved context.

    Returns the generated answer, or a ⚠️ message if no document is indexed.
    """
    global doc_index, doc_chunks
    if doc_index is None:
        return "⚠️ Please upload a document first."
    if not query or not query.strip():
        return "⚠️ Please enter a question."

    # Step 1: Generate hypothetical answer (HyDE step).
    # max_new_tokens (not max_length) so generation length is independent of
    # prompt length — max_length counts the prompt and errors once the prompt
    # alone exceeds it.
    hyde_prompt = f"Write a detailed, hypothetical answer to the question:\n\nQuestion: {query}\nAnswer:"
    hypo_answer = qa_model(hyde_prompt, max_new_tokens=150, num_return_sequences=1)[0]["generated_text"]

    # Step 2: Embed the hypothetical answer instead of the raw query.
    q_emb = embedding_model.encode([hypo_answer], convert_to_numpy=True)

    # Step 3: Retrieve the most relevant chunks. Clamp k to the corpus size and
    # drop FAISS's -1 padding ids, which would otherwise silently index
    # doc_chunks[-1].
    k = min(3, len(doc_chunks))
    D, I = doc_index.search(q_emb, k=k)
    retrieved = [doc_chunks[i] for i in I[0] if i >= 0]

    # Step 4: Build final prompt with context.
    context = "\n\n".join(retrieved)
    final_prompt = f"Answer the question based on the context:\n\nContext: {context}\n\nQuestion: {query}\nAnswer:"

    # Step 5: Generate the final response. text-generation pipelines echo the
    # prompt in "generated_text"; return only the newly generated continuation.
    response = qa_model(final_prompt, max_new_tokens=200, num_return_sequences=1)[0]["generated_text"]
    if response.startswith(final_prompt):
        response = response[len(final_prompt):].strip()
    return response
79
+
80
+ # 6. Gradio UI
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
# 6. Gradio UI: one button indexes the uploaded document, another asks a question.
with gr.Blocks() as demo:
    gr.Markdown("## 📚 HyDE RAG Chatbot (Chat with Any Document)")

    # Upload row: picker plus its action button side by side.
    with gr.Row():
        file_input = gr.File(label="Upload Document", type="filepath")
        upload_btn = gr.Button("Index Document")

    # Status of indexing, then the Q&A widgets.
    status = gr.Textbox(label="Status")
    query = gr.Textbox(label="Ask a Question")
    answer = gr.Textbox(label="Answer")
    ask_btn = gr.Button("Get Answer")

    # Wire buttons to the backend handlers.
    upload_btn.click(upload_file, inputs=file_input, outputs=status)
    ask_btn.click(answer_query, inputs=query, outputs=answer)

demo.launch()