import os
import re

import gradio as gr
import pandas as pd
import requests
from bs4 import BeautifulSoup
from newspaper import Article
from transformers import pipeline

# Sumy and NLTK imports
import nltk
from nltk.tokenize import sent_tokenize
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

# sent_tokenize and Sumy's English tokenizer both need NLTK's punkt models;
# fetch them once at startup (a no-op when already present).
nltk.download("punkt", quiet=True)

classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")


# -------- Summary Cleaning and Extraction -------- #
def preprocess_text(text):
    """Strip URLs, bylines, dates, and Medium boilerplate, then de-duplicate sentences."""
    if not isinstance(text, str):
        return ""
    text = re.sub(r'http\S+', ' ', text)
    kept = []
    for line in text.splitlines():
        line = line.strip()
        if not line:
            continue
        if re.match(r'By\s+\S+', line):
            continue
        if re.search(r'\bFollow\b', line):
            continue
        if re.search(r'\d+\s+min\s+read', line, flags=re.IGNORECASE):
            continue
        if re.search(r'\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+\d{1,2},\s+\d{4}\b', line):
            continue
        if line.lower().startswith((
            "read more", "continue reading", "more from medium",
            "about the author", "related stories", "you might also like"
        )):
            continue
        # Drop all-caps headings longer than three words.
        if line.isupper() and len(line.split()) > 3:
            continue
        kept.append(line)
    text = "\n".join(kept)
    text = re.sub(r'[^\w\s.,!?;:]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    sents = sent_tokenize(text)
    # Order-preserving de-duplication; drop fragments of three words or fewer.
    return ' '.join(dict.fromkeys(s for s in sents if len(s.split()) > 3))


def summarize_with_sumy_auto(text, summary_frac=0.2, min_sentences=3, max_sentences=10):
    """Extractive LSA summary sized to ~summary_frac of the article, clamped to [min, max] sentences."""
    if not isinstance(text, str):
        return ""
    cleaned = preprocess_text(text)
    orig = sent_tokenize(cleaned)
    total = len(orig)
    if total <= min_sentences:
        return ' '.join(orig)
    n = max(min_sentences, min(max_sentences, int(total * summary_frac)))
    parser = PlaintextParser.from_string(cleaned, Tokenizer("english"))
    summarizer = LsaSummarizer(Stemmer("english"))
    summarizer.stop_words = get_stop_words("english")
    return ' '.join(str(s) for s in summarizer(parser.document, n))


# -------- Utility Functions -------- #
def check_url_status(url: str, timeout: int = 5) -> str:
    try:
        resp = requests.head(url, allow_redirects=True, timeout=timeout)
        # Some servers reject HEAD (405); retry with GET before judging.
        if resp.status_code == 405:
            resp = requests.get(url, allow_redirects=True, timeout=timeout)
        return 'Workable' if resp.status_code == 200 else f'Not Workable ({resp.status_code})'
    except requests.RequestException:
        return 'Not Workable'


def detect_keywords_and_score(content, url):
    """Score a blog: 5 if it mentions Imarticus and carries the PGA program link, 3 for the mention alone."""
    keywords = []
    imarticus_found = False
    pga_link = "https://imarticus.org/postgraduate-program-in-data-science-analytics/"
    if content and re.search(r'imarticus', content, re.IGNORECASE):
        keywords.append('Imarticus')
        imarticus_found = True
    pga_link_found = (content and pga_link in content) or pga_link in url
    if content and re.search(r'post graduate', content, re.IGNORECASE):
        keywords.append('post graduate')
    if imarticus_found:
        return keywords, 5 if pga_link_found else 3
    return [], 0


def detect_code_snippet(content):
    """Heuristic check for embedded code: fences, HTML code tags, indentation, common keywords."""
    if not content:
        return False
    code_markers = [
        r'```', r'<code>', r'<pre>', r'\n    ', r'\t',
        r'def ', r'class ', r'\{', r'\}',
        r'import ', r'from ', r'print\('
    ]
    return any(re.search(marker, content) for marker in code_markers)


# ------ Originality Check ----------- #
def extract_blog_text(url):
    """Fetch a page and return the concatenated text of its <p> tags."""
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    paragraphs = soup.find_all('p')
    return ' '.join(p.get_text() for p in paragraphs)
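
# A quick smoke test for the helpers above (a hedged sketch: the URL is
# illustrative, and the printed values depend on the page fetched):
#
#   url = "https://medium.com/@author/example-post"
#   print(check_url_status(url))              # 'Workable' or 'Not Workable (...)'
#   raw = extract_blog_text(url)
#   print(summarize_with_sumy_auto(raw))      # extractive LSA summary
#   print(detect_code_snippet(raw))           # True when code-like markers appear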

def get_ai_generated_score(url, classifier=classifier):
    """Zero-shot estimate (0.0-1.0) of how AI-generated a blog reads.

    Note: the zero-shot pipeline truncates long inputs to the model's maximum
    length, so only the opening of the blog is actually scored.
    """
    text = extract_blog_text(url)
    labels = ["Human-written", "AI-generated"]
    result = classifier(text, candidate_labels=labels)
    scores = dict(zip(result['labels'], result['scores']))
    return scores.get("AI-generated", 0.0)


# -------- Main Summary Extraction -------- #
def extract_summary(file):
    df = pd.read_excel(file)
    total_blogs = len(df)
    imarticus_count = 0
    code_snippet_count = 0
    filtered_rows = []
    full_analysis = []

    for _, row in df.iterrows():
        url = row.get("Blog Link(Medium link)") or row.get("URL") or row.get("url")
        if pd.isna(url):
            continue
        status = check_url_status(url)
        name = row.get("Participant") or row.get("Name")
        if not name or pd.isna(name):
            continue
        centre = row.get("Centre") or row.get("Center")
        if not centre or pd.isna(centre):
            continue
        # originality = get_ai_generated_score(url)
        try:
            article = Article(url)
            article.download()
            article.parse()
            title = article.title
            content = article.text
            if len(content.strip()) == 0:
                continue
            summary = summarize_with_sumy_auto(content)
            keywords, score = detect_keywords_and_score(content, url)
            code_snippet = detect_code_snippet(content)
            if score > 0:
                imarticus_count += 1
            if code_snippet:
                code_snippet_count += 1
            # One record per blog, in the column order the results table declares.
            record = {
                "Participant": name,
                "Centre": centre,
                "URL": url,
                "Status": status,
                "Title": title,
                "Content": content,
                "Summary": summary,
                "Identified_Keywords": ', '.join(keywords) if keywords else "None",
                "Code_Snippet": code_snippet,
                "Score": score
                # "Originality(AI-Score)": originality
            }
            filtered_rows.append(record)
            full_analysis.append(record)
        except Exception as e:
            print(f"Error processing {url}: {e}")
            continue

    filtered_df = pd.DataFrame(filtered_rows)
    full_df = pd.DataFrame(full_analysis)
    return (
        str(total_blogs),
        str(code_snippet_count),
        str(imarticus_count),
        filtered_df,
        full_df
    )


def filter_analysis(full_df, status_filter, score_filter):
    df = full_df.copy()
    if status_filter != "All":
        # startswith keeps "Workable" from also matching "Not Workable (...)".
        df = df[df["Status"].str.startswith(status_filter)]
    if score_filter != "All":
        df = df[df["Score"] == int(score_filter)]
    # Preserve the full column set so the fixed 10-column table renders cleanly.
    df = df[["Participant", "Centre", "URL", "Status", "Title", "Content",
             "Summary", "Identified_Keywords", "Code_Snippet", "Score"]]
    return df


def download_file(full_df):
    if full_df is None or full_df.empty:
        print("No data to download.")
        return None
    output_dir = "./output"
    os.makedirs(output_dir, exist_ok=True)
    file_path = os.path.join(output_dir, "Full_Analysis.xlsx")
    try:
        full_df.to_excel(file_path, index=False)
    except Exception as e:
        print(f"Error saving file: {e}")
        return None
    return file_path


# Alternative download handler that also toggles the file widget's visibility.
def trigger_download(full_df):
    path = download_file(full_df)
    return path, gr.update(visible=True) if path else gr.update(visible=False)
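
# The uploaded workbook must carry participant, centre, and URL columns under
# one of the names probed in extract_summary. A minimal test file could be
# built like this (a hedged sketch; the values are illustrative):
#
#   pd.DataFrame({
#       "Participant": ["Jane Doe"],
#       "Centre": ["Mumbai"],
#       "Blog Link(Medium link)": ["https://medium.com/@jane/example-post"],
#   }).to_excel("sample_blogs.xlsx", index=False)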
elem_classes="sidebar"): gr.Markdown("## 📅 Upload & Filter", elem_id="sidebar-title") file_input = gr.File(label="Upload Excel File (.xlsx)", file_types=[".xlsx"]) analyze_btn = gr.Button("Run Summary") gr.Markdown("## 🔎 Filter") status_filter = gr.Dropdown(["All", "Workable", "Not Workable"], label="Status", value="All") score_filter = gr.Dropdown(["All", "0", "3", "5"], label="Score", value="All") download_btn = gr.Button("Download Full Analysis") download_file_output = gr.File(label="") with gr.Column(scale=3, elem_classes="main-content"): gr.Markdown("

📊 Blog Evaluator

") gr.Markdown("

Analyze blog URLs for educational content, keywords, and coding examples

") with gr.Row(): total_blogs = gr.Textbox(label="Total Blogs", interactive=False) code_snippets = gr.Textbox(label="Blogs with Code Snippets", interactive=False) imarticus_hits = gr.Textbox(label="Blogs with 'Imarticus' Mentions", interactive=False) gr.Markdown("### 📋 Filtered Results Table") full_table = gr.Dataframe( headers=["Participant", "Centre","URL","Status","Title","Content","Summary","Identified_Keywords", "Code_Snippet", "Score"], interactive=False, datatype=["str", "str", "str", "str", "str","str","str","str","bool","number"], row_count=10, col_count=(10, "fixed") ) gr.Markdown("### 📋 Full Analyzed Blog Data Table") filtered_table = gr.Dataframe(headers=["URL", "Status", "Title", "Content", "Summary"], interactive=False) state_full_df = gr.State() def analyze(file): total, codes, imarts, filtered_df, full_df = extract_summary(file) return total, codes, imarts, filtered_df, full_df.values.tolist(), full_df def apply_filters(full_df, status, score): df = filter_analysis(full_df, status, score) return df.values.tolist() analyze_btn.click( fn=analyze, inputs=file_input, outputs=[total_blogs, code_snippets, imarticus_hits, filtered_table, full_table, state_full_df] ) status_filter.change( fn=apply_filters, inputs=[state_full_df, status_filter, score_filter], outputs=full_table ) score_filter.change( fn=apply_filters, inputs=[state_full_df, status_filter, score_filter], outputs=full_table ) download_btn.click( fn=download_file, inputs=state_full_df, outputs=download_file_output ) demo.launch(share=True)