"""Gradio demo: retrieval-augmented question answering over one uploaded PDF.

Flow: extract the PDF text, clean it, cut it into overlapping character
chunks, embed the chunks with a sentence-transformer and store them in a FAISS
L2 index.  A question is embedded the same way, the nearest chunks are
retrieved, and a seq2seq model generates an answer from them.
"""
import os
import re

# Set before the third-party imports below so the setting is already in the
# environment when `transformers` is imported.
os.environ["TRANSFORMERS_NO_TORCHVISION"] = "1"

import gradio as gr
import torch
import faiss
import numpy as np
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Retrieval state shared by the two UI callbacks.  `index` stays None until a
# PDF has been processed; `chunks[i]` is the text behind vector i of `index`.
index = None
chunks = []

# Chunks whose stripped length is not above this are discarded as noise.
_MIN_CHUNK_CHARS = 50

# Number of chunks retrieved as context for each question.
_TOP_K = 3

_PROMPT_TEMPLATE = """
You are a research assistant.
Explain clearly what the paper is about.
Answer in 3-5 complete sentences.
Do not include citations or reference numbers.
If unclear, say the document does not clearly specify.

Context:
{context}

Question:
{question}

Answer:
"""


def split_text(text: str, chunk_size: int = 600, overlap: int = 100) -> list:
    """Cut *text* into overlapping fixed-size character windows.

    Consecutive windows start ``chunk_size - overlap`` characters apart.
    A window is dropped when its stripped content is 50 characters or
    shorter, or consists only of digits.

    Args:
        text: The text to split.
        chunk_size: Window length in characters.
        overlap: Number of characters shared by consecutive windows.

    Returns:
        The kept windows, in document order (unstripped).

    Raises:
        ValueError: If ``overlap >= chunk_size``; the window start could
            never advance, so the split would not terminate.
    """
    step = chunk_size - overlap
    if step <= 0:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )

    pieces = []
    for start in range(0, len(text), step):
        chunk = text[start:start + chunk_size]
        stripped = chunk.strip()
        # Skip fragments that are too short or purely numeric.
        if len(stripped) > _MIN_CHUNK_CHARS and not stripped.isdigit():
            pieces.append(chunk)
    return pieces


def clean_text(text: str) -> str:
    """Strip bibliography and citation noise from extracted paper text.

    Args:
        text: Raw text extracted from the PDF.

    Returns:
        The text with everything from the first "References"/"REFERENCES"
        onward removed, and with arXiv ids, bracketed numeric citations and
        digit-only lines deleted.
    """
    # Keep only what precedes the first "References"/"REFERENCES".
    # NOTE(review): this matches the word anywhere, not just as a section
    # heading, so an earlier in-body mention truncates the document there.
    text = re.split(r'References|REFERENCES', text)[0]

    # Remove arXiv ids such as "arXiv:1706.03762".
    text = re.sub(r'arXiv:\d+\.\d+', '', text)

    # Remove citation markers such as "[12]".
    text = re.sub(r'\[\d+\]', '', text)

    # Blank out lines that contain nothing but digits (e.g. page numbers).
    text = re.sub(r'^\d+\s*$', '', text, flags=re.MULTILINE)

    return text


def process_pdf(file) -> str:
    """Index an uploaded PDF so that questions can be asked about it.

    On success the module-level ``chunks`` and ``index`` are replaced
    together; on any early return they are left untouched.

    Args:
        file: The value delivered by the ``gr.File`` input (``None`` when
            nothing was uploaded); it is passed to ``PdfReader`` unchanged.

    Returns:
        A status message for the UI.
    """
    global index, chunks

    if file is None:
        return "Please upload a PDF."

    reader = PdfReader(file)
    page_texts = (page.extract_text() for page in reader.pages)
    # Join pages with a newline so the last word of one page and the first
    # word of the next are not fused together.
    full_text = "\n".join(text for text in page_texts if text)

    if not full_text.strip():
        return "PDF has no extractable text."

    new_chunks = split_text(clean_text(full_text))
    if not new_chunks:
        # Nothing survived cleaning/filtering; encoding an empty list would
        # yield an array without an embedding dimension to build the index on.
        return "PDF has no usable text after cleaning."

    embeddings = np.asarray(embedder.encode(new_chunks), dtype="float32")

    new_index = faiss.IndexFlatL2(embeddings.shape[1])
    new_index.add(embeddings)

    # Publish both together so `chunks` and `index` never disagree.
    chunks, index = new_chunks, new_index

    return "PDF processed successfully! Ask your question."


def ask_question(question) -> str:
    """Answer *question* using the chunks of the most recently processed PDF.

    Args:
        question: The question text from the UI.

    Returns:
        The generated answer, or a status message when no PDF has been
        processed yet or the question is blank.
    """
    if index is None:
        return "Please process a PDF first."

    if not question or not question.strip():
        return "Please enter a question."

    question_embedding = np.asarray(embedder.encode([question]), dtype="float32")

    # Never request more neighbours than there are vectors: FAISS pads missing
    # results with the id -1, which as a Python index would silently select
    # the last chunk.
    top_k = min(_TOP_K, len(chunks))
    _, neighbor_ids = index.search(question_embedding, k=top_k)
    retrieved_chunks = [chunks[i] for i in neighbor_ids[0] if i >= 0]

    context = "\n\n".join(retrieved_chunks)
    prompt = _PROMPT_TEMPLATE.format(context=context, question=question)

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True)

    # Greedy decoding.  No `temperature` is passed: it only affects sampling
    # (do_sample=True), so supplying it here would change nothing.
    outputs = model.generate(**inputs, max_new_tokens=250)

    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer.strip()


with gr.Blocks() as demo:
    gr.Markdown("# 📚 Clean RAG Paper QA")

    file_input = gr.File(label="Upload Research PDF", file_types=[".pdf"])
    process_btn = gr.Button("Process PDF")
    status_output = gr.Textbox(label="Status")

    question_input = gr.Textbox(label="Ask a Question")
    ask_btn = gr.Button("Get Answer")
    answer_output = gr.Textbox(label="Answer")

    process_btn.click(process_pdf, inputs=file_input, outputs=status_output)
    ask_btn.click(ask_question, inputs=question_input, outputs=answer_output)

demo.launch()