Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import pandas as pd | |
| import re | |
| from newspaper import Article | |
| import requests | |
| import io | |
| import os | |
| import requests | |
| from bs4 import BeautifulSoup | |
| from transformers import pipeline | |
| # Sumy and NLTK imports | |
| from nltk.tokenize import sent_tokenize | |
| from sumy.parsers.plaintext import PlaintextParser | |
| from sumy.nlp.tokenizers import Tokenizer | |
| from sumy.summarizers.lsa import LsaSummarizer | |
| from sumy.nlp.stemmers import Stemmer | |
| from sumy.utils import get_stop_words | |
# Module-level zero-shot classifier used by get_ai_generated_score to estimate
# whether blog text is AI-generated.
# NOTE(review): loading bart-large-mnli at import time is slow and memory-heavy;
# consider lazy initialization if startup latency matters.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
# -------- Summary Cleaning and Extraction -------- #
def preprocess_text(text):
    """Strip scraped-blog boilerplate from *text* and return clean prose.

    Removes URLs, bylines, "Follow"/read-time/date lines, common Medium
    footer phrases and shouting all-caps headings, then collapses whitespace
    and deduplicates sentences, keeping only sentences longer than three
    words. Non-string input yields "".
    """
    if not isinstance(text, str):
        return ""

    # Drop URLs before line filtering so they cannot match other patterns.
    text = re.sub(r'http\S+', ' ', text)

    footer_prefixes = (
        "read more", "continue reading", "more from medium",
        "about the author", "related stories", "you might also like"
    )

    kept = []
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        noise = (
            re.match(r'By\s+\S+', line) is not None
            or re.search(r'\bFollow\b', line) is not None
            or re.search(r'\d+\s+min\s+read', line, flags=re.IGNORECASE) is not None
            or re.search(
                r'\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+\d{1,2},\s+\d{4}\b',
                line,
            ) is not None
            or line.lower().startswith(footer_prefixes)
            # All-caps lines longer than 3 words are treated as headings/banners.
            or (line.isupper() and len(line.split()) > 3)
        )
        if not noise:
            kept.append(line)

    joined = "\n".join(kept)
    joined = re.sub(r'[^\w\s.,!?;:]', ' ', joined)   # strip exotic symbols
    joined = re.sub(r'\s+', ' ', joined).strip()     # collapse whitespace

    sentences = sent_tokenize(joined)
    # dict.fromkeys preserves first-seen order while removing duplicates.
    unique = dict.fromkeys(s for s in sentences if len(s.split()) > 3)
    return ' '.join(unique)
def summarize_with_sumy_auto(text, summary_frac=0.2, min_sentences=3, max_sentences=10):
    """Produce an extractive LSA summary of *text* via sumy.

    The summary length is summary_frac of the cleaned sentence count,
    clamped to [min_sentences, max_sentences]. Texts with at most
    min_sentences sentences are returned whole; non-string input yields "".
    """
    if not isinstance(text, str):
        return ""

    cleaned = preprocess_text(text)
    sentences = sent_tokenize(cleaned)
    if len(sentences) <= min_sentences:
        # Too short to summarize — return the cleaned text as-is.
        return ' '.join(sentences)

    target = max(min_sentences, min(max_sentences, int(len(sentences) * summary_frac)))

    parser = PlaintextParser.from_string(cleaned, Tokenizer("english"))
    summarizer = LsaSummarizer(Stemmer("english"))
    summarizer.stop_words = get_stop_words("english")

    chosen = summarizer(parser.document, target)
    return ' '.join(str(sentence) for sentence in chosen)
# -------- Utility Functions -------- #
def check_url_status(url: str, timeout: int = 5) -> str:
    """Probe *url* and report whether it is reachable.

    Issues a HEAD request (falling back to GET when the server rejects HEAD
    with 405). Returns 'Workable' on HTTP 200, 'Not Workable (<code>)' for
    other status codes, and plain 'Not Workable' on any network error.
    """
    try:
        response = requests.head(url, allow_redirects=True, timeout=timeout)
        if response.status_code == 405:
            # Some servers disallow HEAD; retry with GET.
            response = requests.get(url, allow_redirects=True, timeout=timeout)
    except requests.RequestException:
        return 'Not Workable'
    if response.status_code == 200:
        return 'Workable'
    return f'Not Workable ({response.status_code})'
def detect_keywords_and_score(content, url):
    """Scan blog *content* and *url* for Imarticus branding and score the blog.

    Scoring rules (unchanged): score 5 when 'imarticus' appears in the content
    AND the PGA program link appears in the content or URL; score 3 when only
    'imarticus' appears; otherwise ([], 0) — keyword hits without a brand
    mention are discarded.

    Fix: the original evaluated `pga_link in content` without a None guard,
    raising TypeError when content was None or a non-string (e.g. pandas NaN);
    both arguments are now coerced to "" when not strings.

    Returns:
        tuple[list[str], int]: (identified keywords, score).
    """
    pga_link = "https://imarticus.org/postgraduate-program-in-data-science-analytics/"

    # Guard against None / NaN / other non-string inputs.
    content = content if isinstance(content, str) else ""
    url = url if isinstance(url, str) else ""

    keywords = []
    imarticus_found = False
    if re.search(r'imarticus', content, re.IGNORECASE):
        keywords.append('Imarticus')
        imarticus_found = True

    pga_link_found = pga_link in content or pga_link in url

    if re.search(r'post graduate', content, re.IGNORECASE):
        keywords.append('post graduate')

    if not imarticus_found:
        return [], 0
    return keywords, 5 if pga_link_found else 3
# Heuristic markers that suggest the presence of code in blog text.
# NOTE(review): several markers (';', '\(', '\)', 'from ') are extremely loose
# and will flag ordinary prose; kept for behavioral compatibility.
# NOTE(review): the indented-line marker was r'\n ' with whitespace of unclear
# width in the original (formatting was lost) — 4 spaces assumed; confirm.
_CODE_MARKERS = [
    r'```', r'<code>', r'</code>', r'\n    ', r'\t',
    r'def ', r'class ', r'\{', r'\}', r';', r'\(', r'\)', r'import ', r'from ', r'print\(',
]
# Compiled once at import time: one pass over the text instead of up to
# fifteen separate re.search calls per blog.
_CODE_MARKER_RE = re.compile('|'.join(_CODE_MARKERS))

def detect_code_snippet(content):
    """Return True if *content* appears to contain a code snippet.

    Falsy input (None, "") returns False. Detection is a simple regex match
    against `_CODE_MARKERS`, so false positives on prose are expected.
    """
    if not content:
        return False
    return _CODE_MARKER_RE.search(content) is not None
# ------ Originality Check -----------#
def extract_blog_text(url, timeout=10):
    """Fetch *url* and return the concatenated text of all <p> tags.

    Fix: the original request had no timeout and could hang indefinitely on an
    unresponsive host; `timeout` (seconds, default 10) is a new backward-
    compatible parameter.

    Raises:
        requests.RequestException: on network failure or timeout (as before,
        nothing is caught here — callers handle errors).
    """
    # A browser-like UA avoids trivial bot blocking on some blog hosts.
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')
    return ' '.join(p.get_text() for p in soup.find_all('p'))
def get_ai_generated_score(url, classifier=classifier):
    """Return the zero-shot probability that the blog at *url* is AI-generated.

    Fetches the blog text, classifies it against the labels
    "Human-written" / "AI-generated", and returns the score of the
    "AI-generated" label (0.0 if the label is missing from the result).
    The module-level classifier is bound as a default to avoid reloading
    the model per call.
    """
    candidate_labels = ["Human-written", "AI-generated"]
    blog_text = extract_blog_text(url)
    result = classifier(blog_text, candidate_labels=candidate_labels)
    label_scores = dict(zip(result['labels'], result['scores']))
    return label_scores.get("AI-generated", 0.0)
# -------- Main Summary Extraction -------- #
def _first_cell(row, *columns):
    """Return the first usable value among *columns* of a DataFrame row.

    A value is usable when it is non-null (not None/NaN) and not "".
    Fixes the original `row.get(a) or row.get(b)` chains: pandas NaN is
    truthy, so NaN never fell through to the alternate column and the later
    `if not name` check never filtered it out.
    """
    for col in columns:
        value = row.get(col)
        if value is not None and pd.notna(value) and value != "":
            return value
    return None


def extract_summary(file):
    """Run the full blog-evaluation pipeline over an uploaded Excel sheet.

    Each row must provide a blog URL plus participant and centre columns
    (several alternative header spellings are accepted). For every valid row
    the article is downloaded and parsed, summarized, keyword-scored and
    checked for code snippets; rows that fail any step are skipped with a
    console message.

    Returns:
        tuple: (total_blogs, code_snippet_count, imarticus_count) as display
        strings, followed by the filtered DataFrame (includes Content) and the
        full-analysis DataFrame.
    """
    df = pd.read_excel(file)
    total_blogs = len(df)
    imarticus_count = 0
    code_snippet_count = 0
    filtered_rows = []
    full_analysis = []

    for _, row in df.iterrows():
        url = _first_cell(row, "Blog Link(Medium link)", "URL", "url")
        if url is None:
            continue
        name = _first_cell(row, "Participant", "Name")
        if name is None:
            continue
        centre = _first_cell(row, "Centre", "Center")
        if centre is None:
            continue

        # Network probe done only after the cheap row-validity checks above
        # (the original probed before validating name/centre).
        status = check_url_status(url)

        try:
            article = Article(url)
            article.download()
            article.parse()
            title = article.title
            content = article.text
            if not content.strip():
                continue

            summary = summarize_with_sumy_auto(content)
            keywords, score = detect_keywords_and_score(content, url)
            has_code = detect_code_snippet(content)

            if score > 0:
                imarticus_count += 1
            if has_code:
                code_snippet_count += 1

            record = {
                "Participant": name,
                "Centre": centre,
                "URL": url,
                "Status": status,
                "Title": title,
                "Content": content,
                "Summary": summary,
                "Identified_Keywords": ', '.join(keywords) if keywords else "None",
                "Code_Snippet": has_code,
                "Score": score,
            }
            filtered_rows.append(record)
            # Same fields minus Content, in the column order the UI expects.
            full_analysis.append({key: record[key] for key in (
                "Participant", "Centre", "URL", "Title", "Identified_Keywords",
                "Code_Snippet", "Score", "Summary", "Status")})
        except Exception as e:
            # Best-effort pipeline: one bad blog must not abort the batch.
            print(f"Error processing {url}: {e}")
            continue

    return (
        str(total_blogs),
        str(code_snippet_count),
        str(imarticus_count),
        pd.DataFrame(filtered_rows),
        pd.DataFrame(full_analysis),
    )
def filter_analysis(full_df, status_filter, score_filter):
    """Filter the full-analysis table by URL status and score.

    Args:
        full_df: DataFrame with at least Status, Score, Title,
            Identified_Keywords, Code_Snippet and Summary columns.
        status_filter: "All", "Workable" or "Not Workable".
        score_filter: "All" or a numeric string ("0", "3", "5").

    Fix: the original used `Status.str.contains(status_filter)`, so selecting
    "Workable" also matched every "Not Workable (...)" row; the match is now
    anchored to the start of the string ("Not Workable" still matches its
    status-code variants).

    Returns the display columns only.
    """
    df = full_df.copy()
    if status_filter != "All":
        df = df[df["Status"].str.startswith(status_filter)]
    if score_filter != "All":
        df = df[df["Score"] == int(score_filter)]
    return df[["Title", "Identified_Keywords", "Code_Snippet", "Score", "Summary"]]
def download_file(full_df):
    """Persist *full_df* to ./output/Full_Analysis.xlsx for download.

    Returns the saved file path, or None when there is nothing to save or
    the write fails (both cases are reported on stdout).
    """
    if full_df is None or full_df.empty:
        print("No data to download.")
        return None

    output_dir = "./output"
    os.makedirs(output_dir, exist_ok=True)
    file_path = os.path.join(output_dir, "Full_Analysis.xlsx")

    try:
        full_df.to_excel(file_path, index=False)
        return file_path
    except Exception as e:
        print(f"Error saving file: {e}")
        return None
def trigger_download(full_df):
    """Save the analysis to disk and toggle the download widget accordingly.

    Returns (path, gradio update): the widget is made visible only when the
    file was actually written.
    """
    saved_path = download_file(full_df)
    visibility_update = gr.update(visible=bool(saved_path))
    return saved_path, visibility_update
# -------- Gradio UI -------- #
# Two-column layout: a green sidebar for upload/filter/download controls and a
# main panel with summary counters plus two result tables.
# NOTE(review): the "π" glyphs in the Markdown headings look like mis-decoded
# emoji from the original source; they are runtime strings and left untouched.
with gr.Blocks(css="""
.sidebar { background-color: #00664d; color: white; padding: 20px; height: 100%; border-radius: 10px; }
.sidebar label, .sidebar h2, .sidebar h3, .sidebar span, .sidebar p { color: black !important; }
.main-content { padding: 20px; background-color: #ffffff; border-radius: 10px; }
h1, h3 { color: #00664d; }
@media (min-width: 1024px) {
.gr-block.gr-box { max-width: 1000px; margin: auto; }
}
""") as demo:
    with gr.Row():
        # Sidebar: upload + run, filters, download.
        with gr.Column(scale=1, elem_classes="sidebar"):
            gr.Markdown("## π Upload & Filter", elem_id="sidebar-title")
            file_input = gr.File(label="Upload Excel File (.xlsx)", file_types=[".xlsx"])
            analyze_btn = gr.Button("Run Summary")
            gr.Markdown("## π Filter")
            status_filter = gr.Dropdown(["All", "Workable", "Not Workable"], label="Status", value="All")
            score_filter = gr.Dropdown(["All", "0", "3", "5"], label="Score", value="All")
            download_btn = gr.Button("Download Full Analysis")
            download_file_output = gr.File(label="")
        # Main panel: counters and result tables.
        with gr.Column(scale=3, elem_classes="main-content"):
            gr.Markdown("<h1>π Blog Evaluator </h1>")
            gr.Markdown("<h3>Analyze blog URLs for educational content, keywords, and coding examples</h3>")
            with gr.Row():
                total_blogs = gr.Textbox(label="Total Blogs", interactive=False)
                code_snippets = gr.Textbox(label="Blogs with Code Snippets", interactive=False)
                imarticus_hits = gr.Textbox(label="Blogs with 'Imarticus' Mentions", interactive=False)
            gr.Markdown("### π Filtered Results Table")
            # 10-column table matching the filtered_rows record layout.
            full_table = gr.Dataframe(
                headers=["Participant", "Centre","URL","Status","Title","Content","Summary","Identified_Keywords", "Code_Snippet", "Score"],
                interactive=False,
                datatype=["str", "str", "str", "str", "str","str","str","str","bool","number"],
                row_count=10,
                col_count=(10, "fixed")
            )
            gr.Markdown("### π Full Analyzed Blog Data Table")
            filtered_table = gr.Dataframe(headers=["URL", "Status", "Title", "Content", "Summary"], interactive=False)

    # Holds the full-analysis DataFrame between events so the filter
    # dropdowns can re-filter without re-running the pipeline.
    state_full_df = gr.State()

    def analyze(file):
        # Run the pipeline and fan results out to the widgets.
        # NOTE(review): filtered_df (10 columns) goes to filtered_table, whose
        # headers declare only 5 columns, while full_df (9 columns) goes to
        # full_table, declared with 10 fixed columns — these two outputs look
        # swapped relative to the table definitions; confirm intended wiring.
        total, codes, imarts, filtered_df, full_df = extract_summary(file)
        return total, codes, imarts, filtered_df, full_df.values.tolist(), full_df

    def apply_filters(full_df, status, score):
        # Re-filter the cached full analysis for display.
        df = filter_analysis(full_df, status, score)
        return df.values.tolist()

    analyze_btn.click(
        fn=analyze,
        inputs=file_input,
        outputs=[total_blogs, code_snippets, imarticus_hits, filtered_table, full_table, state_full_df]
    )
    # Both dropdowns re-apply the combined filter on change.
    status_filter.change(
        fn=apply_filters,
        inputs=[state_full_df, status_filter, score_filter],
        outputs=full_table
    )
    score_filter.change(
        fn=apply_filters,
        inputs=[state_full_df, status_filter, score_filter],
        outputs=full_table
    )
    # NOTE(review): wires download_file directly, so trigger_download (which
    # also toggles widget visibility) appears to be unused.
    download_btn.click(
        fn=download_file,
        inputs=state_full_df,
        outputs=download_file_output
    )

# share=True exposes a public Gradio link.
demo.launch(share=True)