simran40 committed on
Commit
45001af
·
verified ·
1 Parent(s): a74d897

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +67 -32
app.py CHANGED
@@ -12,16 +12,23 @@ from transformers import pipeline
12
  # MODEL LOADING (ONCE)
13
  # =================================================
14
 
15
- # Embedding model for semantic search
16
  embedding_model = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1")
17
 
18
- # Extractive Question Answering model (HIGH ACCURACY)
19
  qa_pipeline = pipeline(
20
  "question-answering",
21
  model="deepset/roberta-base-squad2",
22
  tokenizer="deepset/roberta-base-squad2"
23
  )
24
 
 
 
 
 
 
 
 
25
 
26
  # =================================================
27
  # PDF PROCESSING
@@ -52,6 +59,16 @@ def chunk_text(text, chunk_size=350, overlap=80):
52
  return chunks
53
 
54
 
 
 
 
 
 
 
 
 
 
 
55
  # =================================================
56
  # VECTOR DATABASE (FAISS)
57
  # =================================================
@@ -59,10 +76,8 @@ def chunk_text(text, chunk_size=350, overlap=80):
59
  def build_faiss_index(chunks):
60
  embeddings = embedding_model.encode(chunks)
61
  embeddings = np.array(embeddings).astype("float32")
62
-
63
  index = faiss.IndexFlatL2(embeddings.shape[1])
64
  index.add(embeddings)
65
-
66
  return index, chunks
67
 
68
 
@@ -74,13 +89,12 @@ def retrieve_relevant_chunks(question, index, chunks, top_k=5):
74
  for i, idx in enumerate(indices[0]):
75
  results.append((chunks[idx], distances[0][i]))
76
 
77
- # sort by relevance
78
  results.sort(key=lambda x: x[1])
79
  return [r[0] for r in results]
80
 
81
 
82
  # =================================================
83
- # ANSWER GENERATION (ACCURATE)
84
  # =================================================
85
 
86
  def generate_answer(question, context_chunks):
@@ -104,12 +118,32 @@ def generate_answer(question, context_chunks):
104
 
105
 
106
  # =================================================
107
- # MAIN PIPELINE
108
  # =================================================
109
 
110
- def pdf_qa_chat(pdf_file, question):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
  if pdf_file is None or question.strip() == "":
112
- return "Please upload a PDF and enter a valid question."
113
 
114
  text = extract_text_from_pdf(pdf_file.name)
115
  text = clean_text(text)
@@ -118,53 +152,54 @@ def pdf_qa_chat(pdf_file, question):
118
  index, chunks = build_faiss_index(chunks)
119
 
120
  relevant_chunks = retrieve_relevant_chunks(question, index, chunks)
121
- answer = generate_answer(question, relevant_chunks)
 
122
 
123
- return answer
 
 
 
 
 
 
 
 
124
 
125
 
126
  # =================================================
127
- # GRADIO UI
128
  # =================================================
129
 
130
  with gr.Blocks() as demo:
131
 
132
  gr.Markdown("""
133
- # πŸ“„ PDF Question Answering System (High Accuracy)
134
 
135
- Upload a **PDF document** and ask a **specific question**.
136
- The system uses **semantic retrieval + extractive AI**, ensuring
137
- **accurate answers directly from the document** (no hallucination).
138
 
139
- ---
140
  """)
141
 
142
  with gr.Row():
143
  with gr.Column(scale=1):
144
- pdf_input = gr.File(
145
- label="πŸ“€ Upload PDF",
146
- file_types=[".pdf"]
147
- )
148
 
149
  question_input = gr.Textbox(
150
- label="❓ Ask your question",
151
  placeholder="e.g. Whose report is this?",
152
  lines=2
153
  )
154
 
155
- submit_btn = gr.Button("πŸ” Get Answer")
 
156
 
157
  with gr.Column(scale=2):
158
- answer_output = gr.Textbox(
159
- label="πŸ“Œ Answer",
160
- lines=6
161
- )
162
 
163
- submit_btn.click(
164
- fn=pdf_qa_chat,
165
- inputs=[pdf_input, question_input],
166
- outputs=answer_output
167
- )
168
 
169
  gr.Markdown("""
170
  ---
 
12
  # MODEL LOADING (ONCE)
13
  # =================================================
14
 
15
+ # Embedding model for semantic retrieval
16
  embedding_model = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1")
17
 
18
+ # Extractive QA model (accurate answers)
19
  qa_pipeline = pipeline(
20
  "question-answering",
21
  model="deepset/roberta-base-squad2",
22
  tokenizer="deepset/roberta-base-squad2"
23
  )
24
 
25
+ # Summarization model (clean summary)
26
+ summarizer = pipeline(
27
+ "summarization",
28
+ model="facebook/bart-large-cnn",
29
+ tokenizer="facebook/bart-large-cnn"
30
+ )
31
+
32
 
33
  # =================================================
34
  # PDF PROCESSING
 
59
  return chunks
60
 
61
 
62
def chunk_text_for_summary(text, chunk_size=900, overlap=100):
    """Split *text* into overlapping character windows for summarization.

    Each chunk after the first starts ``overlap`` characters before the end
    of the previous one, so text cut at a boundary is repeated in the next
    chunk.

    Args:
        text: The document text to split.
        chunk_size: Maximum number of characters per chunk.
        overlap: Number of characters shared by consecutive chunks.

    Returns:
        The chunks in document order; an empty list when *text* is empty.

    Raises:
        ValueError: If ``overlap >= chunk_size``. The window would never
            advance and the loop would not terminate.
    """
    if chunk_size - overlap <= 0:
        raise ValueError("overlap must be smaller than chunk_size")

    chunks = []
    start = 0
    text_length = len(text)
    while start < text_length:
        end = start + chunk_size
        chunks.append(text[start:end])
        if end >= text_length:
            # This window already reached the end of the text. Stepping back
            # by ``overlap`` would only emit a tail that is fully contained
            # in the chunk just appended.
            break
        start = end - overlap
    return chunks
70
+
71
+
72
  # =================================================
73
  # VECTOR DATABASE (FAISS)
74
  # =================================================
 
76
  def build_faiss_index(chunks):
77
  embeddings = embedding_model.encode(chunks)
78
  embeddings = np.array(embeddings).astype("float32")
 
79
  index = faiss.IndexFlatL2(embeddings.shape[1])
80
  index.add(embeddings)
 
81
  return index, chunks
82
 
83
 
 
89
  for i, idx in enumerate(indices[0]):
90
  results.append((chunks[idx], distances[0][i]))
91
 
 
92
  results.sort(key=lambda x: x[1])
93
  return [r[0] for r in results]
94
 
95
 
96
  # =================================================
97
+ # QUESTION ANSWERING (ACCURATE)
98
  # =================================================
99
 
100
  def generate_answer(question, context_chunks):
 
118
 
119
 
120
  # =================================================
121
+ # SUMMARIZATION
122
  # =================================================
123
 
124
def generate_summary(chunks):
    """Summarize each chunk and join the partial summaries into one string.

    Args:
        chunks: Iterable of text chunks, in document order.

    Returns:
        The per-chunk summaries joined by single spaces, or an empty string
        when there is nothing to summarize.
    """
    summaries = []

    for chunk in chunks:
        if not chunk.strip():
            # A blank chunk has no content to summarize, so skip the model
            # call instead of asking for a summary of nothing.
            continue

        result = summarizer(
            chunk,
            max_length=150,
            min_length=60,
            do_sample=False,
            # Clip inputs that tokenize to more than the model accepts
            # instead of letting the call fail on them.
            truncation=True,
        )
        summaries.append(result[0]["summary_text"])

    return " ".join(summaries)
138
+
139
+
140
+ # =================================================
141
+ # MAIN FUNCTIONS
142
+ # =================================================
143
+
144
+ def pdf_qa(pdf_file, question):
145
  if pdf_file is None or question.strip() == "":
146
+ return "Please upload a PDF and ask a question."
147
 
148
  text = extract_text_from_pdf(pdf_file.name)
149
  text = clean_text(text)
 
152
  index, chunks = build_faiss_index(chunks)
153
 
154
  relevant_chunks = retrieve_relevant_chunks(question, index, chunks)
155
+ return generate_answer(question, relevant_chunks)
156
+
157
 
158
def pdf_summary(pdf_file):
    """Produce a summary of the uploaded PDF.

    Args:
        pdf_file: The uploaded file object (its ``name`` attribute is passed
            to ``extract_text_from_pdf``), or ``None`` when nothing was
            uploaded.

    Returns:
        The generated summary, or a short message when no file was uploaded
        or no text could be extracted from it.
    """
    if pdf_file is None:
        return "Please upload a PDF document."

    text = extract_text_from_pdf(pdf_file.name)
    text = clean_text(text)

    if not text or not text.strip():
        # Without extractable text there is nothing to chunk, and the caller
        # would otherwise receive an unexplained empty result.
        return "No readable text could be extracted from this PDF."

    chunks = chunk_text_for_summary(text)
    return generate_summary(chunks)
167
 
168
 
169
  # =================================================
170
+ # GRADIO UI (QA + SUMMARY)
171
  # =================================================
172
 
173
  with gr.Blocks() as demo:
174
 
175
  gr.Markdown("""
176
+ # 📄 PDF Question Answering & Summarization System
177
 
178
+ This system supports **two functionalities**:
179
+ - 🔍 **Ask Questions** (Accurate answers from PDF)
180
+ - 📝 **Generate Summary** (Concise document summary)
181
 
182
+ Built using **RAG architecture with open-source AI models**.
183
  """)
184
 
185
  with gr.Row():
186
  with gr.Column(scale=1):
187
+ pdf_input = gr.File(label="πŸ“€ Upload PDF", file_types=[".pdf"])
 
 
 
188
 
189
  question_input = gr.Textbox(
190
+ label="❓ Ask a question (for Q&A)",
191
  placeholder="e.g. Whose report is this?",
192
  lines=2
193
  )
194
 
195
+ qa_btn = gr.Button("πŸ” Get Answer")
196
+ summary_btn = gr.Button("πŸ“ Generate Summary")
197
 
198
  with gr.Column(scale=2):
199
+ output_box = gr.Textbox(label="πŸ“Œ Output", lines=12)
 
 
 
200
 
201
+ qa_btn.click(pdf_qa, [pdf_input, question_input], output_box)
202
+ summary_btn.click(pdf_summary, [pdf_input], output_box)
 
 
 
203
 
204
  gr.Markdown("""
205
  ---