simran40 committed on
Commit
fcd815e
·
verified ·
1 Parent(s): 413fc1d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +68 -57
app.py CHANGED
@@ -1,6 +1,7 @@
1
  import gradio as gr
2
- import fitz # PyMuPDF
3
  import re
 
4
  import numpy as np
5
 
6
  from sentence_transformers import SentenceTransformer
@@ -8,14 +9,14 @@ from transformers import pipeline
8
 
9
 
10
  # =================================================
11
- # MODEL LOADING (ONCE AT STARTUP)
12
  # =================================================
13
 
14
- # Embedding model (used for chunk relevance if needed later)
15
  embedding_model = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1")
16
 
17
- # Facebook BART summarization model (BEST accuracy on CPU)
18
- summarizer = pipeline(
19
  "summarization",
20
  model="facebook/bart-large-cnn",
21
  tokenizer="facebook/bart-large-cnn"
@@ -41,99 +42,109 @@ def clean_text(text):
41
  return text.strip()
42
 
43
 
44
def chunk_text(text, chunk_size=900, overlap=100):
    """Split *text* into overlapping character windows.

    Larger chunks are better for summarization, hence the 900-character
    default.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared between consecutive chunks;
            must be smaller than ``chunk_size``.

    Returns:
        A list of substrings that together cover all of *text*. An empty
        *text* yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size`` (the window would never advance).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    step = chunk_size - overlap
    text_length = len(text)
    chunks = []
    start = 0
    while start < text_length:
        end = start + chunk_size
        chunks.append(text[start:end])
        # Stop once this window reached the end of the text; stepping back by
        # `overlap` here would emit a tail chunk that is pure repetition.
        if end >= text_length:
            break
        start += step
    return chunks
58
 
59
 
60
  # =================================================
61
- # SUMMARIZATION LOGIC (ACCURATE & CPU SAFE)
62
  # =================================================
63
 
64
def summarize_chunks(chunks):
    """Summarize each chunk with the module-level ``summarizer`` and join the results.

    Args:
        chunks: Iterable of text chunks to summarize.

    Returns:
        The per-chunk summaries joined by single spaces, in input order.
        Returns an empty string when there is nothing to summarize.
    """
    summaries = []
    for chunk in chunks:
        # Blank chunks carry no content; sending them to the model only
        # produces filler text forced by `min_length`.
        if not chunk or not chunk.strip():
            continue
        result = summarizer(
            chunk,
            max_length=150,
            min_length=60,
            do_sample=False,
            # Guard against chunks longer than the model's input limit.
            truncation=True,
        )
        summaries.append(result[0]["summary_text"])
    return " ".join(summaries)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
 
79
 
80
  # =================================================
81
  # MAIN PIPELINE
82
  # =================================================
83
 
84
def pdf_summarizer(pdf_file):
    """Run the extract -> clean -> chunk -> summarize pipeline on an uploaded PDF.

    Args:
        pdf_file: The uploaded file from the Gradio ``File`` input: either an
            object exposing a ``name`` path attribute or a plain path string.
            ``None`` means nothing was uploaded.

    Returns:
        The generated summary, or a short user-facing message when no file
        was uploaded or the PDF contains no extractable text.
    """
    if pdf_file is None:
        return "Please upload a PDF document."

    # Accept both file-like upload objects and bare path strings.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    # 1. Extract & clean text
    cleaned_text = clean_text(extract_text_from_pdf(pdf_path))

    # 2. Chunk text
    chunks = chunk_text(cleaned_text)
    if not chunks:
        # e.g. scanned/image-only PDFs: report it instead of returning "".
        return "No extractable text was found in the PDF."

    # 3. Summarize
    return summarize_chunks(chunks)
99
 
100
 
101
  # =================================================
102
- # GRADIO UI (PRODUCTION READY)
103
  # =================================================
104
 
105
  with gr.Blocks() as demo:
106
 
107
  gr.Markdown("""
108
- # 📄 PDF Summarizer (Open-Source AI)
109
-
110
- Upload a **PDF document** to generate an **accurate, concise summary**.
111
- This system uses **Facebook BART**, a state-of-the-art open-source
112
- summarization model, optimized for **CPU execution**.
113
 
114
- ---
 
 
115
  """)
116
 
117
  with gr.Row():
118
  with gr.Column(scale=1):
119
- pdf_input = gr.File(
120
- label="📤 Upload PDF",
121
- file_types=[".pdf"]
 
 
122
  )
123
-
124
- summarize_btn = gr.Button("📝 Generate Summary")
125
 
126
  with gr.Column(scale=2):
127
- summary_output = gr.Textbox(
128
- label="📌 Summary",
129
- lines=12
130
- )
131
 
132
- summarize_btn.click(
133
- fn=pdf_summarizer,
134
- inputs=[pdf_input],
135
- outputs=summary_output
136
- )
137
 
138
  gr.Markdown("""
139
  ---
 
1
  import gradio as gr
2
+ import fitz
3
  import re
4
+ import faiss
5
  import numpy as np
6
 
7
  from sentence_transformers import SentenceTransformer
 
9
 
10
 
11
  # =================================================
12
+ # MODELS
13
  # =================================================
14
 
15
+ # Embedding model (for retrieval)
16
  embedding_model = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1")
17
 
18
+ # BART summarization model (used as answer generator)
19
+ bart = pipeline(
20
  "summarization",
21
  model="facebook/bart-large-cnn",
22
  tokenizer="facebook/bart-large-cnn"
 
42
  return text.strip()
43
 
44
 
45
def chunk_text(text, chunk_size=400, overlap=80):
    """Split *text* into overlapping character windows.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared between consecutive chunks;
            must be smaller than ``chunk_size``.

    Returns:
        A list of substrings that together cover all of *text*. An empty
        *text* yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size`` (the window would never advance).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    step = chunk_size - overlap
    text_length = len(text)
    chunks = []
    start = 0
    while start < text_length:
        end = start + chunk_size
        chunks.append(text[start:end])
        # Stop once this window reached the end of the text; stepping back by
        # `overlap` here would emit a tail chunk that is pure repetition.
        if end >= text_length:
            break
        start += step
    return chunks
53
 
54
 
55
  # =================================================
56
+ # VECTOR SEARCH
57
  # =================================================
58
 
59
def build_faiss_index(chunks):
    """Embed *chunks* and load the vectors into a flat L2 FAISS index.

    Args:
        chunks: Non-empty list of text chunks to index.

    Returns:
        A ``(index, chunks)`` tuple: the populated ``faiss.IndexFlatL2`` and
        the same chunk list, so positions returned by a search map back to
        their text.

    Raises:
        ValueError: If *chunks* is empty; there would be no embedding
            dimension to size the index with.
    """
    if not chunks:
        raise ValueError("Cannot build a FAISS index from an empty list of chunks.")

    # The index is fed float32 vectors, as in the original conversion.
    embeddings = np.asarray(embedding_model.encode(chunks), dtype="float32")
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, chunks
65
+
66
+
67
def retrieve_chunks(question, index, chunks, top_k=3):
    """Return the chunks whose embeddings are nearest to the question.

    Args:
        question: The user's question text.
        index: FAISS index built over the embeddings of *chunks*.
        chunks: The chunk list the index was built from, in the same order.
        top_k: Maximum number of chunks to return.

    Returns:
        Up to ``top_k`` chunks in the order returned by the index search.
        Returns an empty list when there are no chunks or ``top_k`` is not
        positive.
    """
    if not chunks or top_k <= 0:
        return []

    # Never ask for more neighbours than there are vectors: FAISS pads the
    # missing slots with -1, which plain indexing would turn into chunks[-1].
    k = min(top_k, len(chunks))
    q_emb = np.asarray(embedding_model.encode([question]), dtype="float32")
    _, indices = index.search(q_emb, k)
    return [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]
71
+
72
+
73
+ # =================================================
74
+ # QUESTION–ANSWER USING BART
75
+ # =================================================
76
 
77
def generate_answer(question, context_chunks):
    """Generate an answer to *question* from the retrieved context chunks.

    The context and question are combined into one prompt that is passed to
    the module-level ``bart`` pipeline.

    Args:
        question: The user's question text.
        context_chunks: Retrieved text chunks used as the only context.

    Returns:
        The generated text, or a short user-facing message when no context
        was retrieved.
    """
    context = " ".join(context_chunks)
    if not context.strip():
        # With nothing retrieved the model would only rephrase the
        # instruction text, so report the miss instead.
        return "No relevant content was found in the document."

    # NOTE(review): `bart` is configured as a summarization pipeline, so the
    # instruction text is part of what gets summarized -- confirm this
    # prompt style gives the intended answers.
    prompt = (
        "\n"
        "Answer the following question using ONLY the given context.\n"
        "\n"
        "Context:\n"
        f"{context}\n"
        "\n"
        "Question:\n"
        f"{question}\n"
    )

    result = bart(
        prompt,
        max_length=120,
        min_length=30,
        do_sample=False,
        # Guard against prompts longer than the model's input limit.
        truncation=True,
    )
    return result[0]["summary_text"]
98
 
99
 
100
  # =================================================
101
  # MAIN PIPELINE
102
  # =================================================
103
 
104
def pdf_qa(pdf_file, question):
    """Answer *question* from the contents of an uploaded PDF.

    Pipeline: extract and clean the text, split it into chunks, index the
    chunks, retrieve those closest to the question, then generate an answer
    from them.

    Args:
        pdf_file: The uploaded file from the Gradio ``File`` input: either an
            object exposing a ``name`` path attribute or a plain path string.
            ``None`` means nothing was uploaded.
        question: The user's question; ``None`` or blank counts as missing.

    Returns:
        The generated answer, or a short user-facing message when an input is
        missing or the PDF contains no extractable text.
    """
    # `not question` also covers None, which `.strip()` alone would crash on.
    if pdf_file is None or not question or not question.strip():
        return "Please upload a PDF and ask a question."

    # Accept both file-like upload objects and bare path strings.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    text = clean_text(extract_text_from_pdf(pdf_path))

    chunks = chunk_text(text)
    if not chunks:
        # e.g. scanned/image-only PDFs: nothing to index or search.
        return "No extractable text was found in the PDF."

    index, chunks = build_faiss_index(chunks)
    relevant_chunks = retrieve_chunks(question, index, chunks)
    return generate_answer(question, relevant_chunks)
118
 
119
 
120
  # =================================================
121
+ # GRADIO UI
122
  # =================================================
123
 
124
  with gr.Blocks() as demo:
125
 
126
  gr.Markdown("""
127
+ # 📄 PDF Question Answering System (BART Based)
 
 
 
 
128
 
129
+ Upload a **PDF** and ask a **specific question**.
130
+ The system retrieves relevant content and generates a **focused answer**,
131
+ not a full summary.
132
  """)
133
 
134
  with gr.Row():
135
  with gr.Column(scale=1):
136
+ pdf_input = gr.File(label="📤 Upload PDF", file_types=[".pdf"])
137
+ question_input = gr.Textbox(
138
+ label="❓ Ask your question",
139
+ placeholder="e.g. What is the objective of the project?",
140
+ lines=2
141
  )
142
+ btn = gr.Button("🔍 Get Answer")
 
143
 
144
  with gr.Column(scale=2):
145
+ output = gr.Textbox(label="📌 Answer", lines=8)
 
 
 
146
 
147
+ btn.click(pdf_qa, [pdf_input, question_input], output)
 
 
 
 
148
 
149
  gr.Markdown("""
150
  ---