BasitAliii committed on
Commit
dec4eb8
·
verified ·
1 Parent(s): cca0ee6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -63
app.py CHANGED
@@ -24,23 +24,21 @@ for pkg in ["punkt", "punkt_tab"]:
24
  # ==========================================================
25
  DEVICE = -1 # CPU (-1), use 0 for GPU if available
26
  SUMMARIZER_MODEL = "facebook/bart-large-cnn"
27
- QA_MODEL = "deepset/roberta-base-squad2"
28
 
29
- print("Loading models... please wait ⏳")
30
 
31
  try:
32
  summarizer = pipeline("summarization", model=SUMMARIZER_MODEL, device=DEVICE)
33
- qa_pipeline = pipeline("question-answering", model=QA_MODEL, device=DEVICE)
34
  except Exception as e:
35
  print("Model load error:", e)
36
  summarizer = None
37
- qa_pipeline = None
38
 
39
 
40
  # ==========================================================
41
  # 🧩 Utility Functions
42
  # ==========================================================
43
  def clean_text(text: str) -> str:
 
44
  text = re.sub(r'\r\n?', '\n', text)
45
  text = re.sub(r'\n{2,}', '\n\n', text)
46
  text = re.sub(r'References[\s\S]*', '', text, flags=re.IGNORECASE)
@@ -50,6 +48,7 @@ def clean_text(text: str) -> str:
50
 
51
 
52
  def extract_text_from_pdf(path: str) -> str:
 
53
  try:
54
  text = ""
55
  with pdfplumber.open(path) as pdf:
@@ -63,10 +62,12 @@ def extract_text_from_pdf(path: str) -> str:
63
 
64
 
65
  def sentence_tokenize(text: str):
 
66
  return [s.strip() for s in nltk.tokenize.sent_tokenize(text) if len(s.strip()) > 10]
67
 
68
 
69
  def chunk_text(text: str, max_chars=1500):
 
70
  sents = sentence_tokenize(text)
71
  chunks, cur = [], ""
72
  for s in sents:
@@ -81,6 +82,7 @@ def chunk_text(text: str, max_chars=1500):
81
 
82
 
83
  def extract_keywords_tfidf(text: str, top_k=8):
 
84
  try:
85
  paras = [p.strip() for p in re.split(r'\n{2,}', text) if len(p.strip()) > 0]
86
  vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
@@ -97,10 +99,14 @@ def extract_keywords_tfidf(text: str, top_k=8):
97
  # ✍️ Adaptive Summarization
98
  # ==========================================================
99
  def summarize_long_text(text: str) -> str:
 
100
  if summarizer is None:
101
  return "Summarization model unavailable."
 
102
  text = clean_text(text)
103
  L = len(text)
 
 
104
  if L < 1500:
105
  max_len, min_len, chunk_size = 180, 60, 1400
106
  elif L < 5000:
@@ -130,6 +136,7 @@ def summarize_long_text(text: str) -> str:
130
  # 🔊 Text-to-Speech
131
  # ==========================================================
132
  def text_to_speech(text):
 
133
  if not text:
134
  return None
135
  try:
@@ -140,56 +147,34 @@ def text_to_speech(text):
140
  return None
141
 
142
 
143
- # ==========================================================
144
- # 🧠 Q&A
145
- # ==========================================================
146
- def generate_auto_questions(text: str, n=5):
147
- sents = sentence_tokenize(text)
148
- qs = []
149
- for s in sents[:n]:
150
- words = s.split()
151
- if len(words) > 5:
152
- qs.append(f"What is meant by: '{' '.join(words[:8])}...'?")
153
- return qs
154
-
155
-
156
- def answer_question(question, context):
157
- if qa_pipeline is None or not context:
158
- return "Q&A model unavailable or no context."
159
- try:
160
- res = qa_pipeline(question=question, context=context)
161
- return res.get("answer", "No answer found.")
162
- except Exception:
163
- return "Error while generating answer."
164
-
165
-
166
  # ==========================================================
167
  # 📄 PDF Handler
168
  # ==========================================================
169
  def process_pdf(pdf_file):
 
170
  if not pdf_file:
171
- return "Please upload a PDF.", "", None, "", ""
172
 
173
  text = extract_text_from_pdf(pdf_file)
174
  if text.startswith("Error") or text.startswith("No text"):
175
- return text, "", None, "", ""
176
 
177
  text = clean_text(text)
178
  summary = summarize_long_text(text)
179
  keywords = ", ".join(extract_keywords_tfidf(text))
180
  audio = text_to_speech(summary)
181
- auto_qs = "\n".join(generate_auto_questions(text, n=6))
182
 
183
- return text, summary, audio, keywords, auto_qs
184
 
185
 
186
  # ==========================================================
187
  # 🎨 Gradio UI
188
  # ==========================================================
189
- with gr.Blocks(title="AI PDF Assistant", theme=gr.themes.Soft()) as demo:
190
- gr.Markdown("# πŸ“˜ AI PDF Assistant β€” Smart Chat & Summarizer")
191
- gr.Markdown("Easily extract, summarize, and chat with your PDFs using AI.")
192
 
 
193
  with gr.Tab("πŸ“„ Analyze PDF"):
194
  with gr.Row():
195
  with gr.Column(scale=1):
@@ -201,41 +186,27 @@ with gr.Blocks(title="AI PDF Assistant", theme=gr.themes.Soft()) as demo:
201
  audio_box = gr.Audio(label="Summary Audio", interactive=False)
202
  keywords_box = gr.Textbox(label="Top Keywords", lines=2, interactive=False)
203
 
204
- with gr.Tab("πŸ’¬ Chat with PDF"):
205
- gr.Markdown("### Auto-Generated Questions")
206
- auto_q_box = gr.Textbox(label="Generated Questions", lines=6, interactive=False, placeholder="Questions will appear after PDF is processed.")
207
- gr.Markdown("### Ask Your Own Question")
208
- user_q = gr.Textbox(label="Your Question", placeholder="Type your question here...")
209
- ask_btn = gr.Button("Ask", variant="primary")
210
- answer_box = gr.Textbox(label="Answer", lines=4, interactive=False)
211
-
212
  with gr.Tab("ℹ️ About"):
213
  gr.Markdown("""
214
- ## πŸ“˜ About AI PDF Assistant
215
- **AI PDF Assistant** helps you understand and interact with PDFs effortlessly.
216
-
217
- ### Features
218
- - Extracts and cleans text
219
- - Generates adaptive summaries
220
- - Identifies keywords
221
- - Creates audio summaries
222
- - Auto-generates Q&A
223
- - Lets you chat with your PDF content
224
-
225
- Built with ❀️ using Hugging Face Transformers, gTTS, and Gradio.
226
  """)
227
 
 
228
  process_btn.click(
229
  process_pdf,
230
  inputs=[pdf_input],
231
- outputs=[extracted_text, summary_box, audio_box, keywords_box, auto_q_box],
232
- )
233
-
234
- ask_btn.click(
235
- answer_question,
236
- inputs=[user_q, extracted_text],
237
- outputs=[answer_box],
238
  )
239
 
240
- print("πŸš€ Launching AI PDF Assistant...")
241
- demo.launch()
 
24
  # ==========================================================
25
  DEVICE = -1 # CPU (-1), use 0 for GPU if available
26
  SUMMARIZER_MODEL = "facebook/bart-large-cnn"
 
27
 
28
+ print("Loading summarization model... please wait ⏳")
29
 
30
  try:
31
  summarizer = pipeline("summarization", model=SUMMARIZER_MODEL, device=DEVICE)
 
32
  except Exception as e:
33
  print("Model load error:", e)
34
  summarizer = None
 
35
 
36
 
37
  # ==========================================================
38
  # 🧩 Utility Functions
39
  # ==========================================================
40
  def clean_text(text: str) -> str:
41
+ """Clean extracted PDF text."""
42
  text = re.sub(r'\r\n?', '\n', text)
43
  text = re.sub(r'\n{2,}', '\n\n', text)
44
  text = re.sub(r'References[\s\S]*', '', text, flags=re.IGNORECASE)
 
48
 
49
 
50
  def extract_text_from_pdf(path: str) -> str:
51
+ """Extract text from all pages of PDF."""
52
  try:
53
  text = ""
54
  with pdfplumber.open(path) as pdf:
 
62
 
63
 
64
  def sentence_tokenize(text: str):
65
+ """Split text into sentences."""
66
  return [s.strip() for s in nltk.tokenize.sent_tokenize(text) if len(s.strip()) > 10]
67
 
68
 
69
  def chunk_text(text: str, max_chars=1500):
70
+ """Split text into chunks for summarization."""
71
  sents = sentence_tokenize(text)
72
  chunks, cur = [], ""
73
  for s in sents:
 
82
 
83
 
84
  def extract_keywords_tfidf(text: str, top_k=8):
85
+ """Extract keywords using TF-IDF."""
86
  try:
87
  paras = [p.strip() for p in re.split(r'\n{2,}', text) if len(p.strip()) > 0]
88
  vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
 
99
  # ✍️ Adaptive Summarization
100
  # ==========================================================
101
  def summarize_long_text(text: str) -> str:
102
+ """Adaptive summarization based on PDF length."""
103
  if summarizer is None:
104
  return "Summarization model unavailable."
105
+
106
  text = clean_text(text)
107
  L = len(text)
108
+
109
+ # Dynamic chunking
110
  if L < 1500:
111
  max_len, min_len, chunk_size = 180, 60, 1400
112
  elif L < 5000:
 
136
  # 🔊 Text-to-Speech
137
  # ==========================================================
138
  def text_to_speech(text):
139
+ """Convert text to speech."""
140
  if not text:
141
  return None
142
  try:
 
147
  return None
148
 
149
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
  # ==========================================================
151
  # 📄 PDF Handler
152
  # ==========================================================
153
  def process_pdf(pdf_file):
154
+ """Main handler to process PDF."""
155
  if not pdf_file:
156
+ return "Please upload a PDF.", "", None, ""
157
 
158
  text = extract_text_from_pdf(pdf_file)
159
  if text.startswith("Error") or text.startswith("No text"):
160
+ return text, "", None, ""
161
 
162
  text = clean_text(text)
163
  summary = summarize_long_text(text)
164
  keywords = ", ".join(extract_keywords_tfidf(text))
165
  audio = text_to_speech(summary)
 
166
 
167
+ return text, summary, audio, keywords
168
 
169
 
170
  # ==========================================================
171
  # 🎨 Gradio UI
172
  # ==========================================================
173
+ with gr.Blocks(title="AI PDF Summarizer", theme=gr.themes.Soft()) as demo:
174
+ gr.Markdown("# πŸ“˜ AI PDF Summarizer β€” Extract, Summarize & Listen")
175
+ gr.Markdown("Easily extract and summarize text from PDFs with AI, and listen to audio summaries.")
176
 
177
+ # --- Analyze PDF Tab ---
178
  with gr.Tab("πŸ“„ Analyze PDF"):
179
  with gr.Row():
180
  with gr.Column(scale=1):
 
186
  audio_box = gr.Audio(label="Summary Audio", interactive=False)
187
  keywords_box = gr.Textbox(label="Top Keywords", lines=2, interactive=False)
188
 
189
+ # --- About Tab ---
 
 
 
 
 
 
 
190
  with gr.Tab("ℹ️ About"):
191
  gr.Markdown("""
192
+ ## πŸ“˜ About AI PDF Summarizer
193
+ **AI PDF Summarizer** helps you quickly understand the contents of any PDF using AI.
194
+
195
+ ### ✨ Features
196
+ - Extracts and cleans text from PDFs
197
+ - Creates adaptive, high-quality summaries
198
+ - Identifies key terms and topics using TF-IDF
199
+ - Generates audio summaries for listening convenience
200
+
201
+ Built with ❀️ using **Hugging Face Transformers**, **Gradio**, and **gTTS**.
 
 
202
  """)
203
 
204
+ # --- Event Connections ---
205
  process_btn.click(
206
  process_pdf,
207
  inputs=[pdf_input],
208
+ outputs=[extracted_text, summary_box, audio_box, keywords_box],
 
 
 
 
 
 
209
  )
210
 
211
+ print("πŸš€ Launching AI PDF Summarizer...")
212
+ demo.launch(share=True, debug=True)