Update app.py
app.py
CHANGED
@@ -0,0 +1,318 @@
import gradio as gr
import pandas as pd
import re
import os
import requests
from newspaper import Article
from bs4 import BeautifulSoup
from transformers import pipeline

# Sumy and NLTK imports
from nltk.tokenize import sent_tokenize
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

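# sent_tokenize and sumy's Tokenizer("english") depend on NLTK's "punkt"
# sentence model; fetching it up front is an assumption about the runtime
# environment (some images already ship this data, in which case the call
# is a no-op).
import nltk
for _pkg in ("punkt", "punkt_tab"):
    nltk.download(_pkg, quiet=True)
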
# Zero-shot classifier backing the optional AI-origin score further below.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# -------- Summary Cleaning and Extraction -------- #
def preprocess_text(text):
    # Strip URLs, bylines, and boilerplate lines, then deduplicate sentences.
    if not isinstance(text, str):
        return ""
    text = re.sub(r'http\S+', ' ', text)
    lines = text.splitlines()
    kept = []
    for line in lines:
        line = line.strip()
        if not line:
            continue
        if re.match(r'By\s+\S+', line): continue
        if re.search(r'\bFollow\b', line): continue
        if re.search(r'\d+\s+min\s+read', line, flags=re.IGNORECASE): continue
        if re.search(r'\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+\d{1,2},\s+\d{4}\b', line): continue
        if line.lower().startswith((
            "read more", "continue reading", "more from medium",
            "about the author", "related stories", "you might also like"
        )): continue
        if line.isupper() and len(line.split()) > 3:
            continue
        kept.append(line)
    text = "\n".join(kept)
    text = re.sub(r'[^\w\s.,!?;:]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    sents = sent_tokenize(text)
    return ' '.join(dict.fromkeys([s for s in sents if len(s.split()) > 3]))

def summarize_with_sumy_auto(text, summary_frac=0.2, min_sentences=3, max_sentences=10):
    # Extractive LSA summary sized to roughly summary_frac of the cleaned text,
    # clamped to [min_sentences, max_sentences].
    if not isinstance(text, str):
        return ""
    cleaned = preprocess_text(text)
    orig = sent_tokenize(cleaned)
    total = len(orig)
    if total <= min_sentences:
        return ' '.join(orig)
    n = max(min_sentences, min(max_sentences, int(total * summary_frac)))
    parser = PlaintextParser.from_string(cleaned, Tokenizer("english"))
    stemmer = Stemmer("english")
    summarizer = LsaSummarizer(stemmer)
    summarizer.stop_words = get_stop_words("english")
    sents = summarizer(parser.document, n)
    return ' '.join(str(s) for s in sents)

# -------- Utility Functions -------- #
def check_url_status(url: str, timeout: int = 5) -> str:
    # Try HEAD first; some servers reject it (405), so fall back to GET.
    try:
        resp = requests.head(url, allow_redirects=True, timeout=timeout)
        if resp.status_code == 405:
            resp = requests.get(url, allow_redirects=True, timeout=timeout)
        return 'Workable' if resp.status_code == 200 else f'Not Workable ({resp.status_code})'
    except requests.RequestException:
        return 'Not Workable'

def detect_keywords_and_score(content, url):
    # Score 5 when the PGA programme link is present, 3 for any other Imarticus mention.
    keywords = []
    score = 0
    imarticus_found = False
    pga_link_found = False
    pga_link = "https://imarticus.org/postgraduate-program-in-data-science-analytics/"
    if content and re.search(r'imarticus', content, re.IGNORECASE):
        keywords.append('Imarticus')
        imarticus_found = True
    if pga_link in content or pga_link in url:
        pga_link_found = True
    if content and re.search(r'post graduate', content, re.IGNORECASE):
        keywords.append('post graduate')
    if imarticus_found:
        score = 5 if pga_link_found else 3
        return keywords, score
    else:
        return [], 0

def detect_code_snippet(content):
    # Loose heuristics: plain punctuation such as ';' or parentheses also occurs
    # in ordinary prose, so this check tends to over-report code.
    if not content:
        return False
    code_markers = [
        r'```', r'<code>', r'</code>', r'\n ', r'\t',
        r'def ', r'class ', r'\{', r'\}', r';', r'\(', r'\)', r'import ', r'from ', r'print\('
    ]
    for marker in code_markers:
        if re.search(marker, content):
            return True
    return False

# -------- Originality Check -------- #
def extract_blog_text(url):
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    paragraphs = soup.find_all('p')
    return ' '.join([p.get_text() for p in paragraphs])

def get_ai_generated_score(url, classifier=classifier):
    # Currently not wired into extract_summary (the call there is commented out).
    text = extract_blog_text(url)
    # classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
    labels = ["Human-written", "AI-generated"]
    result = classifier(text, candidate_labels=labels)
    scores = dict(zip(result['labels'], result['scores']))
    return scores.get("AI-generated", 0.0)

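# Note: the classifier receives the full page text; the transformers pipeline
# is assumed to truncate inputs beyond the model's maximum length, so long
# posts are effectively scored on their opening section only.
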
# -------- Main Summary Extraction -------- #
def extract_summary(file):
    df = pd.read_excel(file)
    total_blogs = len(df)
    imarticus_count = 0
    code_snippet_count = 0
    filtered_rows = []
    full_analysis = []

    for _, row in df.iterrows():
        # Accept a few common header spellings for the URL, participant, and centre columns.
        url = row.get("Blog Link(Medium link)") or row.get("URL") or row.get("url")
        if pd.isna(url):
            continue

        status = check_url_status(url)

        name = row.get("Participant") or row.get("Name")
        if not name:
            continue

        centre = row.get("Centre") or row.get("Center")
        if not centre:
            continue
        # originality = get_ai_generated_score(url)

        try:
            article = Article(url)
            article.download()
            article.parse()
            title = article.title
            content = article.text

            if len(content.strip()) == 0:
                continue

            summary = summarize_with_sumy_auto(content)

            keywords, score = detect_keywords_and_score(content, url)
            code_snippet = detect_code_snippet(content)

            if score > 0:
                imarticus_count += 1
            if code_snippet:
                code_snippet_count += 1

            # filtered_rows and full_analysis currently collect the same rows;
            # they differ only in column order.
            filtered_rows.append({
                "Participant": name,
                "Centre": centre,
                "URL": url,
                "Status": status,
                "Title": title,
                "Content": content,
                "Summary": summary,
                "Identified_Keywords": ', '.join(keywords) if keywords else "None",
                "Code_Snippet": code_snippet,
                "Score": score
                # "Originality(AI-Score)": originality
            })

            full_analysis.append({
                "Participant": name,
                "Centre": centre,
                "URL": url,
                "Title": title,
                "Identified_Keywords": ', '.join(keywords) if keywords else "None",
                "Code_Snippet": code_snippet,
                "Score": score,
                "Summary": summary,
                "Status": status
                # "Originality(AI-Score)": originality
            })

        except Exception as e:
            print(f"Error processing {url}: {e}")
            continue

    filtered_df = pd.DataFrame(filtered_rows)
    full_df = pd.DataFrame(full_analysis)

    return (
        str(total_blogs),
        str(code_snippet_count),
        str(imarticus_count),
        filtered_df,
        full_df
    )

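# Illustrative input for extract_summary (hypothetical values; only the column
# names are taken from the lookups above):
#
#   pd.DataFrame([{
#       "Participant": "Sample Author",
#       "Centre": "Sample Centre",
#       "Blog Link(Medium link)": "https://medium.com/@sample/post",
#   }]).to_excel("sample_blogs.xlsx", index=False)
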
def filter_analysis(full_df, status_filter, score_filter):
    df = full_df.copy()
    if status_filter != "All":
        # Match on the prefix: "Workable" is a substring of "Not Workable", so a
        # plain str.contains() check would keep both groups.
        df = df[df["Status"].str.startswith(status_filter)]
    if score_filter != "All":
        df = df[df["Score"] == int(score_filter)]
    df = df[["Title", "Identified_Keywords", "Code_Snippet", "Score", "Summary"]]
    return df

def download_file(full_df):
    if full_df is None or full_df.empty:
        print("No data to download.")
        return None
    output_dir = "./output"
    os.makedirs(output_dir, exist_ok=True)
    file_path = os.path.join(output_dir, "Full_Analysis.xlsx")
    try:
        full_df.to_excel(file_path, index=False)
    except Exception as e:
        print(f"Error saving file: {e}")
        return None
    return file_path

def trigger_download(full_df):
    # Not currently wired to the UI; download_btn calls download_file directly.
    path = download_file(full_df)
    return path, gr.update(visible=True) if path else gr.update(visible=False)

# -------- Gradio UI -------- #
with gr.Blocks(css="""
.sidebar { background-color: #00664d; color: white; padding: 20px; height: 100%; border-radius: 10px; }
.sidebar label, .sidebar h2, .sidebar h3, .sidebar span, .sidebar p { color: black !important; }
.main-content { padding: 20px; background-color: #ffffff; border-radius: 10px; }
h1, h3 { color: #00664d; }
@media (min-width: 1024px) {
    .gr-block.gr-box { max-width: 1000px; margin: auto; }
}
""") as demo:

    with gr.Row():
        with gr.Column(scale=1, elem_classes="sidebar"):
            gr.Markdown("## Upload & Filter", elem_id="sidebar-title")
            file_input = gr.File(label="Upload Excel File (.xlsx)", file_types=[".xlsx"])
            analyze_btn = gr.Button("Run Summary")

            gr.Markdown("## Filter")
            status_filter = gr.Dropdown(["All", "Workable", "Not Workable"], label="Status", value="All")
            score_filter = gr.Dropdown(["All", "0", "3", "5"], label="Score", value="All")

            download_btn = gr.Button("Download Full Analysis")
            download_file_output = gr.File(label="")

        with gr.Column(scale=3, elem_classes="main-content"):
            gr.Markdown("<h1>Educational Blog Analyzer</h1>")
            gr.Markdown("<h3>Analyze blog URLs for educational content, keywords, and coding examples</h3>")

            with gr.Row():
                total_blogs = gr.Textbox(label="Total Blogs", interactive=False)
                code_snippets = gr.Textbox(label="Blogs with Code Snippets", interactive=False)
                imarticus_hits = gr.Textbox(label="Blogs with 'Imarticus' Mentions", interactive=False)

            gr.Markdown("### Filtered Results Table")
            full_table = gr.Dataframe(
                headers=["Participant", "Centre", "URL", "Status", "Title", "Content", "Summary", "Identified_Keywords", "Code_Snippet", "Score"],
                interactive=False,
                datatype=["str", "str", "str", "str", "str", "str", "str", "str", "bool", "number"],
                row_count=10,
                col_count=(10, "fixed")
            )

            gr.Markdown("### Full Analyzed Blog Data Table")
            filtered_table = gr.Dataframe(headers=["URL", "Status", "Title", "Content", "Summary"], interactive=False)

    state_full_df = gr.State()

    def analyze(file):
        total, codes, imarts, filtered_df, full_df = extract_summary(file)
        return total, codes, imarts, filtered_df, full_df.values.tolist(), full_df

    def apply_filters(full_df, status, score):
        df = filter_analysis(full_df, status, score)
        return df.values.tolist()

    analyze_btn.click(
        fn=analyze,
        inputs=file_input,
        outputs=[total_blogs, code_snippets, imarticus_hits, filtered_table, full_table, state_full_df]
    )

    status_filter.change(
        fn=apply_filters,
        inputs=[state_full_df, status_filter, score_filter],
        outputs=full_table
    )

    score_filter.change(
        fn=apply_filters,
        inputs=[state_full_df, status_filter, score_filter],
        outputs=full_table
    )

    download_btn.click(
        fn=download_file,
        inputs=state_full_df,
        outputs=download_file_output
    )

demo.launch(share=True)