# rag_qa / app.py
# vangru: "Update app.py" (commit fcb0def, verified)
import os
os.environ["TRANSFORMERS_NO_TORCHVISION"] = "1"
import gradio as gr
import torch
import faiss
import numpy as np
import re
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# Sentence-embedding model; its encode() output is what gets indexed and
# what questions are compared against.
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
# Seq2seq checkpoint used for answer generation; the tokenizer and the model
# are both loaded from this same name.
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# Retrieval state shared by process_pdf (writer) and ask_question (reader).
# index stays None until a PDF has been processed; chunks holds the text
# windows in the same order their embeddings were added to the index.
index = None
chunks = []
def split_text(text, chunk_size=600, overlap=100):
    """Split *text* into overlapping fixed-width character windows.

    Consecutive windows start ``chunk_size - overlap`` characters apart, so
    each window repeats the last ``overlap`` characters of the previous one.
    Windows whose stripped content is 50 characters or shorter, or consists
    only of digits, are dropped.

    Args:
        text: The text to split.
        chunk_size: Maximum number of characters per window; must be positive.
        overlap: Number of characters shared by neighbouring windows; must be
            smaller than ``chunk_size`` so that the window always advances.

    Returns:
        A list of the windows that were kept, in document order.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if overlap >= chunk_size:
        # A non-positive step would never advance and loop forever.
        raise ValueError("overlap must be smaller than chunk_size")

    step = chunk_size - overlap
    text_length = len(text)
    pieces = []
    for start in range(0, text_length, step):
        end = start + chunk_size
        chunk = text[start:end]
        stripped = chunk.strip()
        # Skip fragments that are too short or purely numeric (page numbers,
        # table residue) since they carry no retrievable content.
        if len(stripped) > 50 and not stripped.isdigit():
            pieces.append(chunk)
        if end >= text_length:
            # This window already reached the end of the text; a further
            # window would only repeat its tail.
            break
    return pieces
def clean_text(text):
    """Strip bibliography and citation noise from extracted paper text.

    The following are removed, in this order:

    1. Everything from the first "References" / "REFERENCES" *heading* on.
       The heading must occupy a line of its own (optionally preceded by a
       section number and followed by a colon), so an ordinary sentence that
       merely contains the word does not truncate the document.
    2. arXiv identifiers such as ``arXiv:1706.03762`` including an optional
       version suffix (``v5``).
    3. Bracketed numeric citation markers such as ``[12]``.
    4. Lines that consist only of digits (typically page numbers).

    Args:
        text: Raw text extracted from the PDF.

    Returns:
        The cleaned text. Surrounding whitespace left behind by the removals
        is not collapsed.
    """
    # Heading-only line: optional "7", "7." or "7.1" numbering, the word
    # itself, an optional colon, then end of line (tolerating a trailing CR).
    references_heading = (
        r'^[ \t]*(?:\d+(?:\.\d+)*\.?[ \t]+)?'
        r'(?:References|REFERENCES)[ \t]*:?[ \t]*\r?$'
    )
    text = re.split(references_heading, text, maxsplit=1, flags=re.MULTILINE)[0]
    # Remove arXiv IDs together with their version suffix, if any
    text = re.sub(r'arXiv:\d+\.\d+(?:v\d+)?', '', text)
    # Remove citation numbers like [12]
    text = re.sub(r'\[\d+\]', '', text)
    # Remove lines that contain nothing but a number
    text = re.sub(r'^\d+\s*$', '', text, flags=re.MULTILINE)
    return text
def process_pdf(file):
    """Extract, clean, chunk and index the text of an uploaded PDF.

    On success the module-level ``index`` and ``chunks`` are replaced
    together, after the new index has been fully built. On any early return
    the previously processed document (if any) is left untouched.

    Args:
        file: The value delivered by the upload component, or ``None`` when
            nothing was uploaded. Either a path string or an object exposing
            the path as ``.name`` is accepted (which one arrives depends on
            the Gradio version; confirm against the deployed one).

    Returns:
        A status message for display in the UI.
    """
    global index, chunks
    if file is None:
        return "Please upload a PDF."

    pdf_path = getattr(file, "name", file)
    reader = PdfReader(pdf_path)

    # Join pages with a newline so the last word of one page is not fused
    # with the first word of the next.
    page_texts = (page.extract_text() for page in reader.pages)
    full_text = "\n".join(text for text in page_texts if text)
    if not full_text.strip():
        return "PDF has no extractable text."

    new_chunks = split_text(clean_text(full_text))
    if not new_chunks:
        # Cleaning and the minimum-length filter can discard everything;
        # encoding an empty list would not yield a 2-D array to index.
        return "PDF has no usable text after cleaning."

    embeddings = np.asarray(embedder.encode(new_chunks), dtype="float32")
    new_index = faiss.IndexFlatL2(embeddings.shape[1])
    new_index.add(embeddings)

    # Publish both pieces of state at once so they can never disagree.
    index, chunks = new_index, new_chunks
    return "PDF processed successfully! Ask your question."
def ask_question(question):
    """Answer *question* from the most similar chunks of the processed PDF.

    The question is embedded, up to three nearest chunks are fetched from the
    module-level ``index``, and a prompt built from them is passed to the
    seq2seq model.

    Args:
        question: The user's question text.

    Returns:
        The generated answer, or a short instruction when the question is
        empty or no PDF has been processed yet.
    """
    if question is None or not question.strip():
        return "Please enter a question."
    if index is None or index.ntotal == 0:
        return "Please process a PDF first."

    question_embedding = embedder.encode([question]).astype("float32")

    # Never ask for more neighbours than the index holds: FAISS pads missing
    # results with -1, which would silently select chunks[-1].
    top_k = min(3, index.ntotal)
    _, neighbour_ids = index.search(question_embedding, top_k)
    retrieved_chunks = [
        chunks[i] for i in neighbour_ids[0] if 0 <= i < len(chunks)
    ]
    context = "\n\n".join(retrieved_chunks)

    # The question is placed before the context because the tokenizer
    # truncates from the right: if the prompt is too long, the tail of the
    # context is dropped instead of the question.
    prompt = (
        "\n"
        "You are a research assistant.\n"
        "Explain clearly what the paper is about.\n"
        "Answer in 3-5 complete sentences.\n"
        "Do not include citations or reference numbers.\n"
        "If unclear, say the document does not clearly specify.\n"
        "Question:\n"
        f"{question}\n"
        "Context:\n"
        f"{context}\n"
        "Answer:\n"
    )
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
    # No temperature is passed: it only applies when do_sample=True, so the
    # previous value had no effect. Decoding stays deterministic as before.
    outputs = model.generate(**inputs, max_new_tokens=250)
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer.strip()
# UI layout: upload + process on top, question + answer below.
with gr.Blocks() as demo:
    # The books emoji (U+1F4DA) is written as an escape so the title cannot
    # be corrupted again by a wrong file encoding.
    gr.Markdown("# \U0001F4DA Clean RAG Paper QA")

    file_input = gr.File(label="Upload Research PDF", file_types=[".pdf"])
    process_btn = gr.Button("Process PDF")
    status_output = gr.Textbox(label="Status")

    question_input = gr.Textbox(label="Ask a Question")
    ask_btn = gr.Button("Get Answer")
    answer_output = gr.Textbox(label="Answer")

    # Wire each button to its handler; the handler's return value is shown
    # in the corresponding textbox.
    process_btn.click(process_pdf, inputs=file_input, outputs=status_output)
    ask_btn.click(ask_question, inputs=question_input, outputs=answer_output)

demo.launch()