BasitAliii committed on
Commit
ddec509
·
verified ·
1 Parent(s): 108f6c2

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +247 -0
app.py ADDED
@@ -0,0 +1,247 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import tempfile
4
+ from datetime import datetime
5
+ import gradio as gr
6
+ from transformers import pipeline
7
+ import pdfplumber
8
+ from gtts import gTTS
9
+ import nltk
10
+ import numpy as np
11
+ from sklearn.feature_extraction.text import TfidfVectorizer
12
+
13
# ==========================================================
# 🧠 NLTK Setup (Fix for punkt_tab)
# ==========================================================
# Newer NLTK releases resolve the sentence tokenizer via the "punkt_tab"
# data package rather than the legacy "punkt", so ensure both exist.
for pkg in ["punkt", "punkt_tab"]:
    try:
        # find() raises LookupError when the data package is missing.
        nltk.data.find(f"tokenizers/{pkg}")
    except LookupError:
        nltk.download(pkg)

# ==========================================================
# βš™οΈ Model Setup
# ==========================================================
DEVICE = -1  # CPU (-1), 0 for GPU if available
SUMMARIZER_MODEL = "facebook/bart-large-cnn"   # abstractive summarizer checkpoint
QA_MODEL = "deepset/roberta-base-squad2"       # extractive question-answering checkpoint

print("Loading models... please wait ⏳")

# Load both pipelines eagerly at import time; if loading fails the globals
# stay None and the functions below fall back to friendly error messages.
try:
    summarizer = pipeline("summarization", model=SUMMARIZER_MODEL, device=DEVICE)
    qa_pipeline = pipeline("question-answering", model=QA_MODEL, device=DEVICE)
except Exception as e:
    print("Model load error:", e)
    summarizer = None
    qa_pipeline = None
38
+
39
+
40
+ # ==========================================================
41
+ # 🧩 Utility Functions
42
+ # ==========================================================
43
def clean_text(text: str) -> str:
    """Normalize text extracted from a PDF.

    Unifies line endings, drops everything from a "References" section
    onward, strips non-ASCII artifacts, and collapses runs of whitespace.

    Fix: the old version collapsed ALL whitespace (newlines included) into
    single spaces, which destroyed the blank-line paragraph breaks that
    extract_keywords_tfidf later splits on.  Only horizontal whitespace is
    collapsed now; paragraph breaks survive as exactly one blank line.
    """
    text = re.sub(r'\r\n?', '\n', text)        # unify CRLF/CR line endings
    text = re.sub(r'References[\s\S]*', '', text, flags=re.IGNORECASE)  # drop bibliography
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)  # strip non-ASCII extraction junk
    text = re.sub(r'[ \t]+', ' ', text)         # collapse spaces/tabs only
    text = re.sub(r' ?\n ?', '\n', text)        # trim spaces hugging newlines
    text = re.sub(r'\n{2,}', '\n\n', text)      # squeeze blank runs to one paragraph break
    return text.strip()
50
+
51
+
52
def extract_text_from_pdf(path: str) -> str:
    """Pull the plain text out of every page of the PDF at *path*.

    Returns the concatenated page text, a "No text extracted" notice for
    image-only PDFs, or an "Error extracting text: ..." message on failure.
    """
    try:
        pages = []
        with pdfplumber.open(path) as pdf:
            for page in pdf.pages:
                content = page.extract_text()
                if content:
                    # Keep a blank line between pages as a paragraph break.
                    pages.append(content + "\n\n")
        combined = "".join(pages).strip()
        return combined if combined else "No text extracted from PDF."
    except Exception as e:
        return f"Error extracting text: {e}"
63
+
64
+
65
def sentence_tokenize(text: str):
    """Split *text* into sentences, discarding fragments of <= 10 chars."""
    sentences = []
    for raw in nltk.tokenize.sent_tokenize(text):
        candidate = raw.strip()
        if len(candidate) > 10:
            sentences.append(candidate)
    return sentences
67
+
68
+
69
def chunk_text(text: str, max_chars=1500):
    """Greedily pack sentences into chunks of roughly *max_chars* characters.

    Fix: when the very first sentence already exceeded max_chars the old
    code appended the still-empty accumulator, yielding a bogus "" chunk.
    An oversize sentence now becomes its own (oversize) chunk instead.
    """
    sents = sentence_tokenize(text)
    chunks, cur = [], ""
    for s in sents:
        # Always accept a sentence into an empty chunk, even if oversize,
        # so we never emit an empty string.
        if not cur or len(cur) + len(s) < max_chars:
            cur += (" " if cur else "") + s
        else:
            chunks.append(cur)
            cur = s
    if cur:
        chunks.append(cur)
    return chunks
81
+
82
+
83
def extract_keywords_tfidf(text: str, top_k=8):
    """Return up to *top_k* 1-2 gram keywords ranked by mean TF-IDF.

    Paragraphs (blank-line separated) act as the document collection.
    Returns [] when vectorization is impossible (e.g. no usable text).
    """
    try:
        docs = [chunk.strip() for chunk in re.split(r'\n{2,}', text) if chunk.strip()]
        vec = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
        matrix = vec.fit_transform(docs)
        terms = vec.get_feature_names_out()
        mean_scores = np.asarray(matrix.mean(axis=0)).ravel()
        ranked = np.argsort(mean_scores)[::-1]
        return [terms[i] for i in ranked[:top_k]]
    except Exception:
        # Empty corpus / empty vocabulary / missing dependency: no keywords.
        return []
94
+
95
+
96
+ # ==========================================================
97
+ # ✍️ Summarization
98
+ # ==========================================================
99
def summarize_long_text(text: str) -> str:
    """Summarize *text* adaptively, chunking long documents.

    Target summary length and chunk size scale with document length.
    Short documents get a single summarization pass; long ones are split
    into up to 6 sentence chunks, each summarized, then the concatenated
    chunk summaries are summarized once more.

    Fix: if every per-chunk pass failed, the old code fed the empty string
    to the final pass, which raised uncaught; we now report the failure.
    """
    if summarizer is None:
        return "Summarization model unavailable."

    text = clean_text(text)
    length = len(text)

    # Scale output bounds and chunk size with document size.
    if length < 1500:
        max_len, min_len, chunk_size = 180, 60, 1400
    elif length < 5000:
        max_len, min_len, chunk_size = 250, 100, 1600
    elif length < 15000:
        max_len, min_len, chunk_size = 350, 150, 1800
    else:
        max_len, min_len, chunk_size = 500, 200, 2000

    # Short enough for a single pass.
    if length <= chunk_size:
        return summarizer(text, max_length=max_len, min_length=min_len,
                          do_sample=False)[0]["summary_text"]

    # Map: summarize each chunk (bounded to 6 chunks to cap latency).
    parts = chunk_text(text, max_chars=chunk_size)[:6]
    summaries = []
    for part in parts:
        try:
            summaries.append(summarizer(part, max_length=200, min_length=80,
                                        do_sample=False)[0]["summary_text"])
        except Exception:
            continue  # best-effort: skip chunks the model chokes on

    if not summaries:
        return "Summarization failed for this document."

    # Reduce: one final pass over the concatenated chunk summaries.
    combined = " ".join(summaries)
    return summarizer(combined, max_length=max_len, min_length=min_len,
                      do_sample=False)[0]["summary_text"]
129
+
130
+
131
+ # ==========================================================
132
+ # πŸ”Š Text to Speech
133
+ # ==========================================================
134
def text_to_speech(text):
    """Render the first ~900 chars of *text* to an MP3 via gTTS.

    Returns the temp-file path, or None for empty input / any failure.

    Fix: the temp-file handle was never closed; gTTS then wrote to the
    same path through a second handle, leaking the descriptor (and
    failing outright on Windows, where the open file cannot be reopened).
    """
    if not text:
        return None
    try:
        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
        tmp.close()  # release the handle so gTTS can reopen the path
        gTTS(text=text[:900], lang="en").save(tmp.name)
        return tmp.name
    except Exception:
        # Best-effort feature: audio is optional, so swallow TTS errors.
        return None
143
+
144
+
145
+ # ==========================================================
146
+ # πŸ’¬ Q&A Generation
147
+ # ==========================================================
148
def generate_auto_questions(text: str, n=5):
    """Build up to *n* 'What is meant by ...' prompts from leading sentences."""
    questions = []
    for sentence in sentence_tokenize(text)[:n]:
        tokens = sentence.split()
        if len(tokens) > 5:  # skip sentences too short to be meaningful
            prefix = " ".join(tokens[:8])
            questions.append(f"What is meant by: '{prefix}...'?")
    return questions
156
+
157
+
158
def answer_question(question, context):
    """Answer *question* from *context* using the extractive QA pipeline.

    Returns a user-facing message (never raises) when the model is
    unavailable, the context is empty, or inference fails.
    """
    if qa_pipeline is None or not context:
        return "Q&A model unavailable or no context."
    try:
        result = qa_pipeline(question=question, context=context)
    except Exception:
        return "Error while generating answer."
    return result.get("answer", "No answer found.")
166
+
167
+
168
+ # ==========================================================
169
+ # πŸ“„ PDF Handler
170
+ # ==========================================================
171
def process_pdf(pdf_file):
    """Run the full pipeline on an uploaded PDF.

    Returns the 5-tuple bound to the UI outputs:
    (extracted text, summary, audio path, keywords, auto questions).
    On any failure the first slot carries the message and the rest are empty.
    """
    if not pdf_file:
        return "Please upload a PDF.", "", None, "", ""

    raw = extract_text_from_pdf(pdf_file)
    # extract_text_from_pdf signals failure via these sentinel prefixes.
    if raw.startswith(("Error", "No text")):
        return raw, "", None, "", ""

    cleaned = clean_text(raw)
    summary = summarize_long_text(cleaned)
    keywords = ", ".join(extract_keywords_tfidf(cleaned))
    audio = text_to_speech(summary)
    questions = "\n".join(generate_auto_questions(cleaned, n=6))

    return cleaned, summary, audio, keywords, questions
186
+
187
+
188
# ==========================================================
# 🎨 Gradio Interface
# ==========================================================
# Components created inside this `with` block register on `demo`; the
# event wiring at the bottom connects the buttons to the pipeline
# functions defined above.
with gr.Blocks(title="AI PDF Assistant", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# πŸ“˜ AI PDF Assistant β€” Smart Chat & Summarizer")
    gr.Markdown("Easily extract, summarize, and chat with your PDFs using AI.")

    # --- Analyze PDF Tab ---
    with gr.Tab("πŸ“„ Analyze PDF"):
        with gr.Row():
            with gr.Column(scale=1):
                # Upload + trigger on the left.
                pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"], type="filepath")
                process_btn = gr.Button("πŸš€ Process PDF", variant="primary")
            with gr.Column(scale=2):
                # Read-only result panes on the right.
                extracted_text = gr.Textbox(label="Extracted Text", lines=8, interactive=False)
                summary_box = gr.Textbox(label="Summary", lines=6, interactive=False)
                audio_box = gr.Audio(label="Summary Audio", interactive=False)
                keywords_box = gr.Textbox(label="Top Keywords", lines=2, interactive=False)

    # --- Chat with PDF Tab ---
    with gr.Tab("πŸ’¬ Chat with PDF"):
        gr.Markdown("### Auto-Generated Questions")
        # Filled by process_pdf (see outputs list below), not by this tab.
        auto_q_box = gr.Textbox(label="Generated Questions", lines=6, interactive=False)
        gr.Markdown("### Ask Your Own Question")
        user_q = gr.Textbox(label="Your Question", placeholder="Type your question here...")
        ask_btn = gr.Button("Ask", variant="primary")
        answer_box = gr.Textbox(label="Answer", lines=4, interactive=False)

    # --- About Tab ---
    with gr.Tab("ℹ️ About"):
        gr.Markdown("""
## πŸ“˜ About AI PDF Assistant
**AI PDF Assistant** helps you understand and interact with PDFs effortlessly.

### Features
- Extracts and cleans text
- Generates adaptive summaries
- Identifies keywords
- Creates audio summaries
- Auto-generates Q&A
- Lets you chat with your PDF content

Built with ❀️ using Hugging Face Transformers, gTTS, and Gradio.
""")

    # --- Event Connections ---
    # Process button: run the full pipeline and populate every output pane,
    # including the auto-question box on the chat tab.
    process_btn.click(
        process_pdf,
        inputs=[pdf_input],
        outputs=[extracted_text, summary_box, audio_box, keywords_box, auto_q_box],
    )

    # Ask button: answer against the extracted text as the QA context.
    ask_btn.click(
        answer_question,
        inputs=[user_q, extracted_text],
        outputs=[answer_box],
    )

print("πŸš€ Launching AI PDF Assistant...")
demo.launch()