akazmi committed
Commit 51c2867 · verified · 1 Parent(s): ecc7d10

Update app.py

Files changed (1): app.py +41 -70
app.py CHANGED
@@ -1,85 +1,56 @@
- import gradio as gr
  import torch
- import pdfplumber
- import re
- import numpy as np
- from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
  from sentence_transformers import SentenceTransformer
  from sklearn.metrics.pairwise import cosine_similarity

- # ===== Load Embedding Model =====
- embedder = SentenceTransformer("all-MiniLM-L6-v2")
-
- # ===== Load QA Model =====
- model_name = "mistralai/Mistral-7B-Instruct-v0.1"
- tokenizer = AutoTokenizer.from_pretrained(model_name)
- model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32, device_map="auto")
- qa_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)
-
- # ===== Read PDF and Clean =====
- def read_pdf(file_path):
-     try:
-         with pdfplumber.open(file_path) as pdf:
-             return "\n".join(page.extract_text() or "" for page in pdf.pages)
-     except Exception as e:
-         return f"Error reading PDF: {str(e)}"
-
- # ===== Smart Sentence Chunking =====
- def chunk_text(text, max_len=500):
-     sentences = re.split(r'(?<=[.؟!])\s+', text)
-     chunks, current = [], ""
-     for sentence in sentences:
-         if len(current) + len(sentence) <= max_len:
-             current += sentence + " "
-         else:
-             chunks.append(current.strip())
-             current = sentence + " "
-     if current:
-         chunks.append(current.strip())
-     return chunks
-
- # ===== Semantic Retrieval =====
- def get_relevant_chunks(question, chunks, top_k=2):
-     q_vec = embedder.encode([question])
-     c_vecs = embedder.encode(chunks)
-     sims = cosine_similarity(q_vec, c_vecs)[0]
-     top_indices = np.argsort(sims)[-top_k:][::-1]
-     return "\n\n".join([chunks[i] for i in top_indices])
-
- # ===== Generate Answer =====
- def answer_question(file, question):
-     if not file:
-         return "⚠️ Please upload a PDF."
-     if not question.strip():
-         return "⚠️ Please enter a question."
-
-     raw_text = read_pdf(file.name)
-     if raw_text.startswith("Error"):
-         return raw_text
-
-     chunks = chunk_text(raw_text)
-     context = get_relevant_chunks(question, chunks)
-
-     prompt = (
-         f"You are a legal expert. Based on the context below, answer the question in a detailed and explanatory manner.\n\n"
-         f"Context:\n{context}\n\n"
-         f"Question: {question}\n\n"
-         f"Answer:"
-     )
-
-     try:
-         response = qa_pipeline(prompt, max_new_tokens=300, do_sample=False, temperature=0.3)
-         return response[0]["generated_text"].split("Answer:")[-1].strip()
-     except Exception as e:
-         return f"Error generating answer: {e}"
-
- # ===== Gradio Interface =====
  with gr.Blocks() as demo:
-     gr.Markdown("## 📘 Document Question Answering (RAG-powered)")
-     file = gr.File(label="Upload PDF", file_types=[".pdf"])
-     question = gr.Textbox(label="Ask a question", placeholder="e.g., Is there any section for cost audit?")
-     answer = gr.Textbox(label="Answer", lines=10)
-     submit = gr.Button("Get Answer")
-     submit.click(fn=answer_question, inputs=[file, question], outputs=answer)

  demo.launch()
 
+ import os
  import torch
+ import gradio as gr
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
  from sentence_transformers import SentenceTransformer
  from sklearn.metrics.pairwise import cosine_similarity
+ import PyPDF2
+
+ # Load LLM and Embedding model
+ qa_model = "google/flan-t5-large"
+ tokenizer = AutoTokenizer.from_pretrained(qa_model)
+ model = AutoModelForSeq2SeqLM.from_pretrained(qa_model)
+ qa_pipeline = pipeline("text2text-generation", model=model, tokenizer=tokenizer)
+
+ embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
+
+ # Global document store
+ documents = []
+ document_embeddings = []
+
+ def extract_text(file):
+     reader = PyPDF2.PdfReader(file)
+     return "\n".join(page.extract_text() for page in reader.pages if page.extract_text())
+
+ def add_document(file):
+     text = extract_text(file)
+     documents.append(text)
+     document_embeddings.append(embedder.encode(text))
+     return "Document uploaded and indexed successfully."
+
+ def generate_answer(query):
+     if not documents:
+         return "Please upload a document first."
+
+     query_embedding = embedder.encode(query)
+     similarities = cosine_similarity([query_embedding], document_embeddings)[0]
+     best_match_index = similarities.argmax()
+     relevant_text = documents[best_match_index][:3000]  # Truncate if too long
+
+     prompt = f"Answer this question based on the context:\n\nContext: {relevant_text}\n\nQuestion: {query}"
+     answer = qa_pipeline(prompt, max_new_tokens=300, temperature=0.3)[0]["generated_text"]
+     return answer.strip()
+
+ # Gradio UI
  with gr.Blocks() as demo:
+     gr.Markdown("# 📄 Document Reader with RAG (Flan-T5)")
+     file_input = gr.File(label="Upload PDF", type="file")
+     upload_btn = gr.Button("Upload & Index")
+     query = gr.Textbox(label="Ask a question")
+     submit_btn = gr.Button("Get Answer")
+     answer_box = gr.Textbox(label="Answer")
+
+     upload_btn.click(fn=add_document, inputs=file_input, outputs=answer_box)
+     submit_btn.click(fn=generate_answer, inputs=query, outputs=answer_box)

  demo.launch()