akazmi committed (verified)
Commit f28212f
1 Parent(s): 92378ab

Create app.py

Files changed (1)
  1. app.py +80 -0
app.py ADDED
@@ -0,0 +1,80 @@
+ import gradio as gr
+ from PyPDF2 import PdfReader
+ from sentence_transformers import SentenceTransformer
+ from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
+ import numpy as np
+ from sklearn.metrics.pairwise import cosine_similarity
+
+ # Load embedding model (small and fast)
+ embedder = SentenceTransformer("all-MiniLM-L6-v2")
+
+ # Load FLAN-T5 model (CPU-friendly)
+ model_name = "google/flan-t5-base"
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+ model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+ rag_pipeline = pipeline("text2text-generation", model=model, tokenizer=tokenizer)
+
+ # Read PDF text
+ def read_pdf(file_path):
+     try:
+         with open(file_path, "rb") as file:
+             reader = PdfReader(file)
+             text = ""
+             for page in reader.pages:
+                 page_text = page.extract_text()
+                 if page_text:
+                     text += page_text + "\n"
+             return text
+     except Exception as e:
+         return f"Error reading PDF: {str(e)}"
+
+ # Split into ~500-word chunks
+ def chunk_text(text, chunk_size=500):
+     words = text.split()
+     return [" ".join(words[i:i+chunk_size]) for i in range(0, len(words), chunk_size)]
+
+ # Find top-k relevant chunks via cosine similarity
+ def retrieve_relevant_chunks(question, chunks, top_k=3):
+     chunk_embeddings = embedder.encode(chunks)
+     question_embedding = embedder.encode([question])
+     scores = cosine_similarity(question_embedding, chunk_embeddings)[0]
+     # argsort is ascending, so take the last top_k indices and reverse for best-first order
+     top_indices = np.argsort(scores)[-top_k:][::-1]
+     return "\n\n".join([chunks[i] for i in top_indices])
+
+ # Main QA function
+ def answer_question(uploaded_file, user_question):
+     if uploaded_file is None:
+         return "❌ Please upload a PDF file."
+
+     # gr.File may pass a filepath string or a tempfile-like object with a .name attribute,
+     # depending on the Gradio version
+     file_path = uploaded_file if isinstance(uploaded_file, str) else uploaded_file.name
+
+     text = read_pdf(file_path)
+     if text.startswith("Error reading PDF"):
+         return f"❌ {text}"
+     if not text.strip():
+         return "❌ Could not extract text from the document."
+
+     chunks = chunk_text(text)
+     if not chunks:
+         return "❌ Document too short or empty."
+
+     context = retrieve_relevant_chunks(user_question, chunks)
+
+     prompt = f"Context: {context}\n\nQuestion: {user_question}\nAnswer:"
+
+     try:
+         result = rag_pipeline(prompt, max_new_tokens=256)
+         return result[0]["generated_text"].strip()
+     except Exception as e:
+         return f"❌ Error during generation: {str(e)}"
+
+ # Gradio Interface
+ def create_interface():
+     with gr.Blocks() as demo:
+         gr.Markdown("## 📄 Ask Questions from a PDF Document (RAG using FLAN-T5)")
+         file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
+         question_input = gr.Textbox(label="Enter your question")
+         answer_output = gr.Textbox(label="Answer", lines=10)
+
+         gr.Button("Ask").click(fn=answer_question, inputs=[file_input, question_input], outputs=[answer_output])
+     return demo
+
+ if __name__ == "__main__":
+     demo = create_interface()
+     demo.launch()
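
Note: this commit adds only app.py; the packages imported above (gradio, PyPDF2, sentence-transformers, transformers, numpy, scikit-learn) would also have to be available in the Space, typically via a requirements.txt. As a quick smoke test of the QA path outside the Gradio UI, a minimal sketch follows; the file name and question are placeholders, and it assumes answer_question is called with a plain file path, which the isinstance check above allows:

# Minimal smoke test (not part of the commit); importing app loads the models.
from app import answer_question

print(answer_question("sample.pdf", "What is the main topic of the document?"))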