Imarticuslearning committed on
Commit
cc6f067
·
verified ·
1 Parent(s): d46f6ab

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +318 -0
app.py CHANGED
@@ -0,0 +1,318 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import re
4
+ from newspaper import Article
5
+ import requests
6
+ import io
7
+ import os
8
+ import requests
9
+ from bs4 import BeautifulSoup
10
+ from transformers import pipeline
11
+
12
+ # Sumy and NLTK imports
13
+ from nltk.tokenize import sent_tokenize
14
+ from sumy.parsers.plaintext import PlaintextParser
15
+ from sumy.nlp.tokenizers import Tokenizer
16
+ from sumy.summarizers.lsa import LsaSummarizer
17
+ from sumy.nlp.stemmers import Stemmer
18
+ from sumy.utils import get_stop_words
19
+
20
# Shared zero-shot classification pipeline, built once when the module is loaded; it is the default `classifier` argument of get_ai_generated_score.
+ classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
21
+
22
+ # -------- Summary Cleaning and Extraction -------- #
23
def preprocess_text(text):
    """Strip URLs, boilerplate lines and odd symbols from article text.

    Non-string input yields an empty string. Otherwise the text is cleaned
    line by line, collapsed to single-spaced text, split into sentences, and
    the sentences longer than three words are returned joined by spaces with
    exact duplicates removed (first occurrence wins).
    """
    if not isinstance(text, str):
        return ""

    boilerplate_prefixes = (
        "read more", "continue reading", "more from medium",
        "about the author", "related stories", "you might also like"
    )

    def is_noise(candidate):
        # A line is dropped when it looks like a byline, a "Follow" prompt,
        # a reading-time badge, a publication date, a known footer phrase,
        # or an all-caps heading of more than three words.
        return bool(
            re.match(r'By\s+\S+', candidate)
            or re.search(r'\bFollow\b', candidate)
            or re.search(r'\d+\s+min\s+read', candidate, flags=re.IGNORECASE)
            or re.search(r'\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+\d{1,2},\s+\d{4}\b', candidate)
            or candidate.lower().startswith(boilerplate_prefixes)
            or (candidate.isupper() and len(candidate.split()) > 3)
        )

    without_urls = re.sub(r'http\S+', ' ', text)
    stripped_lines = (raw.strip() for raw in without_urls.splitlines())
    body = "\n".join(ln for ln in stripped_lines if ln and not is_noise(ln))

    # Keep only word characters, whitespace and basic punctuation, then
    # collapse every whitespace run to a single space.
    body = re.sub(r'[^\w\s.,!?;:]', ' ', body)
    body = re.sub(r'\s+', ' ', body).strip()

    # A dict preserves insertion order, so it doubles as an ordered set.
    unique_sentences = {}
    for sentence in sent_tokenize(body):
        if len(sentence.split()) > 3:
            unique_sentences.setdefault(sentence, None)
    return ' '.join(unique_sentences)
49
+
50
def summarize_with_sumy_auto(text, summary_frac=0.2, min_sentences=3, max_sentences=10):
    """Summarise ``text`` with sumy's LSA summarizer.

    The text is first cleaned with ``preprocess_text``. If the cleaned text
    has at most ``min_sentences`` sentences it is returned as-is; otherwise
    the summary length is ``summary_frac`` of the sentence count, clamped to
    the range ``[min_sentences, max_sentences]``. Non-string input returns
    an empty string.
    """
    if not isinstance(text, str):
        return ""

    cleaned_text = preprocess_text(text)
    sentences = sent_tokenize(cleaned_text)
    sentence_total = len(sentences)
    if sentence_total <= min_sentences:
        return ' '.join(sentences)

    # Clamp the proportional target: cap at max_sentences first, then raise
    # to min_sentences (same precedence as max(min, min(max, value))).
    target = int(sentence_total * summary_frac)
    if target > max_sentences:
        target = max_sentences
    if target < min_sentences:
        target = min_sentences

    document = PlaintextParser.from_string(cleaned_text, Tokenizer("english")).document
    lsa = LsaSummarizer(Stemmer("english"))
    lsa.stop_words = get_stop_words("english")
    return ' '.join(map(str, lsa(document, target)))
65
+
66
+ # -------- Utility Functions -------- #
67
def check_url_status(url: str, timeout: int = 5) -> str:
    """Report whether ``url`` answers with HTTP 200.

    A HEAD request is tried first; if the server answers 405 the request is
    repeated as a GET. Returns ``'Workable'`` for a final status of 200,
    ``'Not Workable (<status>)'`` for any other status, and plain
    ``'Not Workable'`` when the request itself fails.
    """
    try:
        response = requests.head(url, allow_redirects=True, timeout=timeout)
        if response.status_code == 405:
            response = requests.get(url, allow_redirects=True, timeout=timeout)
    except requests.RequestException:
        return 'Not Workable'

    if response.status_code == 200:
        return 'Workable'
    return f'Not Workable ({response.status_code})'
75
+
76
def detect_keywords_and_score(content, url):
    """Find brand keywords in an article and score it.

    Args:
        content: Article text. Anything that is not a string (``None``, a
            NaN spreadsheet cell, ...) is treated as empty text.
        url: Article URL. Anything that is not a string is treated as empty.

    Returns:
        A ``(keywords, score)`` tuple. When "imarticus" (case-insensitive)
        does not occur in ``content`` the result is ``([], 0)``. Otherwise
        ``keywords`` starts with ``'Imarticus'``, gains ``'post graduate'``
        if that phrase occurs, and ``score`` is 5 when the programme link
        appears in the content or the URL, else 3.
    """
    pga_link = "https://imarticus.org/postgraduate-program-in-data-science-analytics/"

    # Normalise inputs so neither the regex searches nor the `in` checks can
    # raise TypeError on None / NaN values.
    text = content if isinstance(content, str) else ""
    link = url if isinstance(url, str) else ""

    if not re.search(r'imarticus', text, re.IGNORECASE):
        return [], 0

    keywords = ['Imarticus']
    if re.search(r'post graduate', text, re.IGNORECASE):
        keywords.append('post graduate')

    score = 5 if (pga_link in text or pga_link in link) else 3
    return keywords, score
94
+
95
# Signals that a text contains source code. Bare punctuation such as "(",
# ")" or ";" and bare words such as "from " or "class " occur in ordinary
# prose, so every keyword pattern is anchored to the start of a line and
# every punctuation pattern to the end of one.
_CODE_SNIPPET_RE = re.compile(
    "|".join((
        # Markdown code fence.
        r'```',
        # HTML <code> / </code> tag.
        r'</?code>',
        # Function or class definition at the start of a line.
        r'^[ \t]*(?:def|class)[ \t]+[A-Za-z_]\w*[ \t]*[(:]',
        # A line that is entirely an import statement.
        r'^[ \t]*import[ \t]+[A-Za-z_][\w.]*(?:[ \t]+as[ \t]+\w+)?'
        r'(?:[ \t]*,[ \t]*[A-Za-z_][\w.]*(?:[ \t]+as[ \t]+\w+)?)*[ \t]*\r?$',
        # "from package import name" at the start of a line.
        r'^[ \t]*from[ \t]+\.*[A-Za-z_][\w.]*[ \t]+import[ \t]+\S',
        # Call to print(...).
        r'\bprint[ \t]*\(',
        # Assignment whose right-hand side is a call, e.g. df = pd.read_csv(
        r'^[ \t]*[A-Za-z_][\w.]*[ \t]*=[ \t]*[A-Za-z_][\w.]*\(',
        # Line ending in a brace or semicolon (C-like languages).
        r'[{};][ \t]*\r?$',
    )),
    re.MULTILINE,
)


def detect_code_snippet(content):
    """Return True when ``content`` appears to contain a code snippet.

    Args:
        content: Article text. Empty or non-string values return ``False``.

    Returns:
        ``True`` if any pattern in ``_CODE_SNIPPET_RE`` matches (code fences,
        ``<code>`` tags, ``def``/``class``/``import`` lines, ``print(``
        calls, call assignments, or lines ending in ``{``, ``}`` or ``;``),
        otherwise ``False``.
    """
    if not isinstance(content, str) or not content:
        return False
    return _CODE_SNIPPET_RE.search(content) is not None
106
+
107
+ # ------ Originality Check -----------#
108
def extract_blog_text(url, timeout=10):
    """Download ``url`` and return the text of all its ``<p>`` elements.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. requests
            applies no timeout unless one is passed, so without it a stalled
            server would block the caller indefinitely.

    Returns:
        The paragraph texts joined by single spaces (empty string when the
        page has no ``<p>`` elements).

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')
    return ' '.join(p.get_text() for p in soup.find_all('p'))
114
+
115
def get_ai_generated_score(url, classifier=classifier):
    """Score how strongly the page at ``url`` is labelled "AI-generated".

    The page text comes from ``extract_blog_text``. ``classifier`` is called
    with that text and the candidate labels "Human-written" and
    "AI-generated"; its result is read as a mapping with parallel
    ``'labels'`` and ``'scores'`` sequences. Returns the score paired with
    "AI-generated", or 0.0 if that label is absent from the result.
    """
    candidate_labels = ["Human-written", "AI-generated"]
    prediction = classifier(extract_blog_text(url), candidate_labels=candidate_labels)
    label_scores = {
        label: score
        for label, score in zip(prediction['labels'], prediction['scores'])
    }
    return label_scores.get("AI-generated", 0.0)
122
+
123
+ # -------- Main Summary Extraction -------- #
124
# Column order of the two frames returned by extract_summary. Passing these
# to pd.DataFrame keeps the columns present even when no row was analysed.
_DETAIL_COLUMNS = [
    "Participant", "Centre", "URL", "Status", "Title", "Content", "Summary",
    "Identified_Keywords", "Code_Snippet", "Score",
]
_ANALYSIS_COLUMNS = [
    "Participant", "Centre", "URL", "Title", "Identified_Keywords",
    "Code_Snippet", "Score", "Summary", "Status",
]


def _first_present(row, *columns):
    """Return the first usable cell among ``columns`` of ``row``, else None.

    A cell is unusable when the column is absent, the value is NaN/None, or
    the value is falsy (empty or whitespace-only string, 0). Strings are
    returned stripped of surrounding whitespace.
    """
    for column in columns:
        value = row.get(column)
        # Empty spreadsheet cells arrive as NaN, which is truthy, so a plain
        # `or` chain would stop on them instead of trying the next column.
        if value is None or pd.isna(value):
            continue
        if isinstance(value, str):
            value = value.strip()
        if not value:
            continue
        return value
    return None


def extract_summary(file):
    """Analyse every blog listed in an Excel sheet.

    Each row needs a URL ("Blog Link(Medium link)", "URL" or "url"), a
    participant ("Participant" or "Name") and a centre ("Centre" or
    "Center"); rows missing any of them are skipped. For the remaining rows
    the URL is probed, the article is downloaded and parsed, and its text is
    summarised, scored for keywords and checked for code snippets. Rows
    whose article cannot be processed or has no text are skipped as well.

    Args:
        file: Anything ``pandas.read_excel`` accepts.

    Returns:
        A 5-tuple ``(total_blogs, code_snippet_count, imarticus_count,
        filtered_df, full_df)``. The three counts are strings;
        ``total_blogs`` counts all rows of the sheet, including skipped
        ones. ``filtered_df`` has the columns of ``_DETAIL_COLUMNS`` and
        ``full_df`` those of ``_ANALYSIS_COLUMNS``.
    """
    df = pd.read_excel(file)
    total_blogs = len(df)
    imarticus_count = 0
    code_snippet_count = 0
    filtered_rows = []
    full_analysis = []

    for _, row in df.iterrows():
        url = _first_present(row, "Blog Link(Medium link)", "URL", "url")
        name = _first_present(row, "Participant", "Name")
        centre = _first_present(row, "Centre", "Center")
        # Validate the whole row before any network request is made.
        if url is None or name is None or centre is None:
            continue

        status = check_url_status(url)
        # NOTE: originality scoring via get_ai_generated_score is disabled.

        try:
            article = Article(url)
            article.download()
            article.parse()
            title = article.title
            content = article.text
            if not content.strip():
                continue
            summary = summarize_with_sumy_auto(content)
            keywords, score = detect_keywords_and_score(content, url)
            code_snippet = detect_code_snippet(content)
        except Exception as e:
            # Best effort: one unreadable article must not abort the batch.
            print(f"Error processing {url}: {e}")
            continue

        if score > 0:
            imarticus_count += 1
        if code_snippet:
            code_snippet_count += 1

        keyword_text = ', '.join(keywords) if keywords else "None"

        filtered_rows.append({
            "Participant": name,
            "Centre": centre,
            "URL": url,
            "Status": status,
            "Title": title,
            "Content": content,
            "Summary": summary,
            "Identified_Keywords": keyword_text,
            "Code_Snippet": code_snippet,
            "Score": score,
        })

        full_analysis.append({
            "Participant": name,
            "Centre": centre,
            "URL": url,
            "Title": title,
            "Identified_Keywords": keyword_text,
            "Code_Snippet": code_snippet,
            "Score": score,
            "Summary": summary,
            "Status": status,
        })

    filtered_df = pd.DataFrame(filtered_rows, columns=_DETAIL_COLUMNS)
    full_df = pd.DataFrame(full_analysis, columns=_ANALYSIS_COLUMNS)

    return (
        str(total_blogs),
        str(code_snippet_count),
        str(imarticus_count),
        filtered_df,
        full_df,
    )
209
+
210
def filter_analysis(full_df, status_filter, score_filter):
    """Filter the analysis frame by URL status and score.

    Args:
        full_df: DataFrame with at least the columns "Status", "Score",
            "Title", "Identified_Keywords", "Code_Snippet" and "Summary".
            ``None`` or an empty frame yields an empty result.
        status_filter: "All", or a status prefix such as "Workable" or
            "Not Workable".
        score_filter: "All", or a score written as an integer string.

    Returns:
        A new DataFrame with the columns "Title", "Identified_Keywords",
        "Code_Snippet", "Score" and "Summary" for the matching rows.
    """
    display_columns = ["Title", "Identified_Keywords", "Code_Snippet", "Score", "Summary"]
    if full_df is None or full_df.empty:
        return pd.DataFrame(columns=display_columns)

    df = full_df
    if status_filter != "All":
        # Match on the prefix: a substring test would let "Workable" also
        # select "Not Workable" and "Not Workable (404)".
        statuses = df["Status"].astype(str)
        df = df[statuses.str.startswith(status_filter)]
    if score_filter != "All":
        df = df[df["Score"] == int(score_filter)]
    return df[display_columns].copy()
218
+
219
def download_file(full_df):
    """Write ``full_df`` to ``./output/Full_Analysis.xlsx``.

    Returns the path of the written file, or ``None`` when there is nothing
    to write (``full_df`` is ``None`` or empty) or when saving fails. Both
    failure cases print a message instead of raising.
    """
    has_data = full_df is not None and not full_df.empty
    if not has_data:
        print("No data to download.")
        return None

    target_dir = "./output"
    os.makedirs(target_dir, exist_ok=True)
    destination = os.path.join(target_dir, "Full_Analysis.xlsx")

    try:
        full_df.to_excel(destination, index=False)
    except Exception as exc:
        print(f"Error saving file: {exc}")
        return None
    else:
        return destination
232
+
233
def trigger_download(full_df):
    """Save the analysis via ``download_file`` and build a visibility update.

    Returns ``(path, update)`` where ``update`` is ``gr.update(visible=True)``
    when a file path was produced and ``gr.update(visible=False)`` otherwise.
    """
    saved_path = download_file(full_df)
    if saved_path:
        visibility = gr.update(visible=True)
    else:
        visibility = gr.update(visible=False)
    return saved_path, visibility
236
+
237
+ # -------- Gradio UI -------- #
238
# Gradio front end: a sidebar with upload, filters and download, and a main
# column with the summary counters and two result tables.
with gr.Blocks(css="""
.sidebar { background-color: #00664d; color: white; padding: 20px; height: 100%; border-radius: 10px; }
.sidebar label, .sidebar h2, .sidebar h3, .sidebar span, .sidebar p { color: black !important; }
.main-content { padding: 20px; background-color: #ffffff; border-radius: 10px; }
h1, h3 { color: #00664d; }
@media (min-width: 1024px) {
.gr-block.gr-box { max-width: 1000px; margin: auto; }
}
""") as demo:

    with gr.Row():
        with gr.Column(scale=1, elem_classes="sidebar"):
            gr.Markdown("## 📅 Upload & Filter", elem_id="sidebar-title")
            file_input = gr.File(label="Upload Excel File (.xlsx)", file_types=[".xlsx"])
            analyze_btn = gr.Button("Run Summary")

            gr.Markdown("## 🔎 Filter")
            status_filter = gr.Dropdown(["All", "Workable", "Not Workable"], label="Status", value="All")
            score_filter = gr.Dropdown(["All", "0", "3", "5"], label="Score", value="All")

            download_btn = gr.Button("Download Full Analysis")
            download_file_output = gr.File(label="")

        with gr.Column(scale=3, elem_classes="main-content"):
            gr.Markdown("<h1>📊 Educational Blog Analyzer</h1>")
            gr.Markdown("<h3>Analyze blog URLs for educational content, keywords, and coding examples</h3>")

            with gr.Row():
                total_blogs = gr.Textbox(label="Total Blogs", interactive=False)
                code_snippets = gr.Textbox(label="Blogs with Code Snippets", interactive=False)
                imarticus_hits = gr.Textbox(label="Blogs with 'Imarticus' Mentions", interactive=False)

            gr.Markdown("### 📋 Filtered Results Table")
            # Headers mirror the columns returned by filter_analysis.
            filtered_table = gr.Dataframe(
                headers=["Title", "Identified_Keywords", "Code_Snippet", "Score", "Summary"],
                interactive=False
            )

            gr.Markdown("### 📋 Full Analyzed Blog Data Table")
            # Headers mirror the 10-column frame that extract_summary returns
            # as its fourth value.
            full_table = gr.Dataframe(
                headers=["Participant", "Centre", "URL", "Status", "Title", "Content", "Summary", "Identified_Keywords", "Code_Snippet", "Score"],
                interactive=False,
                datatype=["str", "str", "str", "str", "str", "str", "str", "str", "bool", "number"],
                row_count=10,
                col_count=(10, "fixed")
            )

    # Holds the analysis frame (with "Status" and "Score") between events so
    # the filters and the download button can reuse it.
    state_full_df = gr.State()

    def apply_filters(full_df, status, score):
        """Return the rows of the filtered view for ``filtered_table``."""
        # Nothing analysed yet (or nothing survived analysis): show no rows.
        if full_df is None or full_df.empty:
            return []
        return filter_analysis(full_df, status, score).values.tolist()

    def analyze(file, status, score):
        """Run the analysis and fill the counters, both tables and the state."""
        if file is None:
            raise gr.Error("Please upload an Excel file (.xlsx) before running the summary.")
        total, codes, imarts, filtered_df, full_df = extract_summary(file)
        return (
            total,
            codes,
            imarts,
            # Filtered view honours the currently selected dropdown values.
            apply_filters(full_df, status, score),
            # The 10-column frame lines up with full_table's fixed headers.
            filtered_df.values.tolist(),
            full_df,
        )

    analyze_btn.click(
        fn=analyze,
        inputs=[file_input, status_filter, score_filter],
        outputs=[total_blogs, code_snippets, imarticus_hits, filtered_table, full_table, state_full_df]
    )

    # Both dropdowns refresh the filtered view only; the full table is left
    # untouched.
    status_filter.change(
        fn=apply_filters,
        inputs=[state_full_df, status_filter, score_filter],
        outputs=filtered_table
    )

    score_filter.change(
        fn=apply_filters,
        inputs=[state_full_df, status_filter, score_filter],
        outputs=filtered_table
    )

    download_btn.click(
        fn=download_file,
        inputs=state_full_df,
        outputs=download_file_output
    )


demo.launch(share=True)