Spaces:

Rishitha3
/

HyDE_RAG

Sleeping

App Files Files Community

Rishitha3 committed on Aug 25, 2025

Commit

3dda9b8

verified ·

1 Parent(s): 4f396da

Update app.py

Browse files

Files changed (1) hide show

app.py +86 -141

app.py CHANGED Viewed

@@ -1,147 +1,92 @@
-import streamlit as st
-import fitz  # PyMuPDF
-import re
-from sentence_transformers import SentenceTransformer
-import numpy as np
 import faiss
-from transformers import AutoTokenizer, AutoModelForCausalLM
 import os
-from huggingface_hub import login   # ✅ added for token auth
-# -----------------------------
-# PDF Text Loader
-# -----------------------------
-def load_pdf_text(uploaded_file):
-    doc = fitz.open(stream=uploaded_file.read(), filetype="pdf")
     text = ""
-    for page in doc:
-        text += page.get_text()
-    if not text.strip():
-        raise ValueError("No text found in PDF.")
     return text
-# -----------------------------
-# Chunk Text
-# -----------------------------
-def chunk_text(text, max_tokens=200):
-    sentences = re.split(r'(?<=[.!?]) +', text)
-    chunks, current_chunk = [], []
-    current_len = 0
-    for sentence in sentences:
-        word_count = len(sentence.split())
-        if current_len + word_count > max_tokens:
-            chunks.append(" ".join(current_chunk))
-            current_chunk = [sentence]
-            current_len = word_count
-        else:
-            current_chunk.append(sentence)
-            current_len += word_count
-    if current_chunk:
-        chunks.append(" ".join(current_chunk))
-    return chunks
-# -----------------------------
-# Simple Vector Store
-# -----------------------------
-class SimpleVectorStore:
-    def __init__(self, dim):
-        self.dim = dim
-        self.vectors = []
-        self.metadata = []
-        self.index = None
-    def add(self, vectors, metas):
-        for v, m in zip(vectors, metas):
-            vec = np.array(v, dtype=np.float32)
-            self.vectors.append(vec)
-            self.metadata.append(m)
-        if self.vectors:
-            self.index = faiss.IndexFlatL2(self.dim)
-            self.index.add(np.stack(self.vectors))
-    def search(self, query_vector, k=5):
-        query_vector = np.array(query_vector, dtype=np.float32).reshape(1, -1)
-        D, I = self.index.search(query_vector, k)
-        results = [self.metadata[i] for i in I[0]]
-        return results
-# -----------------------------
-# Index PDF
-# -----------------------------
-def index_pdf(uploaded_file):
-    text = load_pdf_text(uploaded_file)
-    chunks = chunk_text(text)
-    embed_model = SentenceTransformer("all-MiniLM-L6-v2")
-    vectors = embed_model.encode(chunks)
-    store = SimpleVectorStore(dim=vectors.shape[1])
-    store.add(vectors, chunks)
-    return embed_model, store, chunks
-# -----------------------------
-# Load LLaMA Model
-# -----------------------------
-@st.cache_resource
-def load_llm():
-    model_id = "meta-llama/Llama-3.2-3b-instruct"
-    # ✅ Get token from HF secrets
-    hf_token = os.getenv("HF_TOKEN")
-    if not hf_token:
-        raise ValueError("HF_TOKEN is not set. Please add it in Hugging Face Secrets.")
-    login(hf_token)  # ✅ Authenticate with HF Hub
-    tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
-    llm = AutoModelForCausalLM.from_pretrained(
-        model_id,
-        device_map="auto",
-        torch_dtype="auto",
-        token=hf_token   # ✅ Needed to load gated model
-    )
-    return tokenizer, llm
-# -----------------------------
-# Streamlit UI
-# -----------------------------
-st.set_page_config(page_title="Student Assisted Chatbot", page_icon="🤖", layout="wide")
-st.title("🎓 Student Assisted Chatbot")
-st.write("Upload your textbook (PDF) and ask questions about it.")
-uploaded_file = st.file_uploader("Upload PDF", type="pdf")
-user_input = st.text_input("Your question:")
-if uploaded_file and user_input:
-    try:
-        embed_model, store, chunks = index_pdf(uploaded_file)
-        tokenizer, llm = load_llm()
-        query_vec = embed_model.encode([user_input])[0]
-        relevant_chunks = store.search(query_vec, k=5)
-        context = "\n".join(relevant_chunks)
-        prompt = f"""
-[INST] You are a helpful tutor. Based only on the context below, answer the question in complete sentences.
-If the context does not contain enough information, say "I could not find this in the text."
-Context:
-{context}
-Question: {user_input}
-Answer: [/INST]
-"""
-        inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(llm.device)
-        outputs = llm.generate(
-            **inputs,
-            max_new_tokens=300,
-            temperature=0.7,
-            top_p=0.9,
-            do_sample=True
-        )
-        answer = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
-        if "Answer:" in answer:
-            answer = answer.split("Answer:")[-1].strip()
-        st.write("🧠 Answer")
-        st.write(answer if answer else "Sorry, I couldn’t generate a complete answer.")
-    except Exception as e:
-        st.error(f"Error: {e}")

+import gradio as gr
+import fitz  # PyMuPDF for PDFs
+import docx
 import faiss
 import os
+from sentence_transformers import SentenceTransformer
+from transformers import pipeline
+# 1. Load embedding + QA model
+embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
+qa_model = pipeline("text-generation", model="gpt2")  # Replace with better model if GPU available
+# 2. Helper: extract text from files
+def extract_text(file):
     text = ""
+    if file.name.endswith(".pdf"):
+        doc = fitz.open(file.name)
+        for page in doc:
+            text += page.get_text("text")
+    elif file.name.endswith(".docx"):
+        doc = docx.Document(file.name)
+        for para in doc.paragraphs:
+            text += para.text + "\n"
+    else:  # fallback: txt
+        text = file.read().decode("utf-8", errors="ignore")
     return text
+# 3. Helper: create FAISS index
+def build_faiss(text, chunk_size=500, overlap=50):
+    # Split text into chunks
+    chunks = []
+    for i in range(0, len(text), chunk_size - overlap):
+        chunks.append(text[i:i + chunk_size])
+    # Embed chunks
+    embeddings = embedding_model.encode(chunks, convert_to_numpy=True)
+    # Store in FAISS
+    index = faiss.IndexFlatL2(embeddings.shape[1])
+    index.add(embeddings)
+    return index, chunks
+# Global storage
+doc_index = None
+doc_chunks = None
+# 4. Process uploaded file
+def upload_file(file):
+    global doc_index, doc_chunks
+    text = extract_text(file)
+    doc_index, doc_chunks = build_faiss(text)
+    return "✅ Document indexed! You can now ask questions."
+# 5. Answer questions
+def answer_query(query):
+    global doc_index, doc_chunks
+    if doc_index is None:
+        return "⚠️ Please upload a document first."
+    # Embed query
+    q_emb = embedding_model.encode([query], convert_to_numpy=True)
+    # Retrieve top 3
+    D, I = doc_index.search(q_emb, k=3)
+    retrieved = [doc_chunks[i] for i in I[0]]
+    # Build prompt
+    context = "\n\n".join(retrieved)
+    prompt = f"Answer the question based on the context:\n\nContext: {context}\n\nQuestion: {query}\nAnswer:"
+    # Generate
+    response = qa_model(prompt, max_length=200, num_return_sequences=1)[0]["generated_text"]
+    return response
+# 6. Gradio UI
+with gr.Blocks() as demo:
+    gr.Markdown("## 📚 Chat with Any Document (RAG Demo)")
+    with gr.Row():
+        file_input = gr.File(label="Upload Document", type="filepath")
+        upload_btn = gr.Button("Index Document")
+    status = gr.Textbox(label="Status")
+    query = gr.Textbox(label="Ask a Question")
+    answer = gr.Textbox(label="Answer")
+    ask_btn = gr.Button("Get Answer")
+    upload_btn.click(upload_file, inputs=file_input, outputs=status)
+    ask_btn.click(answer_query, inputs=query, outputs=answer)
+demo.launch()