import gradio as gr
import pandas as pd
import re
from newspaper import Article
import requests
import io
import os
from bs4 import BeautifulSoup
from transformers import pipeline
# Sumy and NLTK imports
import nltk
nltk.download('punkt', quiet=True)  # sent_tokenize needs the punkt model; no-op if already installed
from nltk.tokenize import sent_tokenize
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
# -------- Summary Cleaning and Extraction -------- #
def preprocess_text(text):
    if not isinstance(text, str):
        return ""
    text = re.sub(r'http\S+', ' ', text)
    lines = text.splitlines()
    kept = []
    for line in lines:
        line = line.strip()
        if not line:
            continue
        if re.match(r'By\s+\S+', line): continue
        if re.search(r'\bFollow\b', line): continue
        if re.search(r'\d+\s+min\s+read', line, flags=re.IGNORECASE): continue
        if re.search(r'\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+\d{1,2},\s+\d{4}\b', line): continue
        if line.lower().startswith((
            "read more", "continue reading", "more from medium",
            "about the author", "related stories", "you might also like"
        )): continue
        if line.isupper() and len(line.split()) > 3:
            continue
        kept.append(line)
    text = "\n".join(kept)
    text = re.sub(r'[^\w\s.,!?;:]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    sents = sent_tokenize(text)
    return ' '.join(dict.fromkeys([s for s in sents if len(s.split()) > 3]))
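# Example (illustrative input): bylines, read-time lines, and duplicate sentences are stripped:
#   preprocess_text("By Jane Doe\n5 min read\nPandas makes data cleaning easy. Pandas makes data cleaning easy.")
#   -> "Pandas makes data cleaning easy."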
def summarize_with_sumy_auto(text, summary_frac=0.2, min_sentences=3, max_sentences=10):
    if not isinstance(text, str):
        return ""
    cleaned = preprocess_text(text)
    orig = sent_tokenize(cleaned)
    total = len(orig)
    if total <= min_sentences:
        return ' '.join(orig)
    n = max(min_sentences, min(max_sentences, int(total * summary_frac)))
    parser = PlaintextParser.from_string(cleaned, Tokenizer("english"))
    stemmer = Stemmer("english")
    summarizer = LsaSummarizer(stemmer)
    summarizer.stop_words = get_stop_words("english")
    sents = summarizer(parser.document, n)
    return ' '.join(str(s) for s in sents)
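# Example: a 40-sentence post with summary_frac=0.2 targets int(40 * 0.2) = 8 sentences,
# clamped to the [min_sentences, max_sentences] window of [3, 10].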
# -------- Utility Functions -------- #
def check_url_status(url: str, timeout: int = 5) -> str:
    try:
        resp = requests.head(url, allow_redirects=True, timeout=timeout)
        if resp.status_code == 405:
            resp = requests.get(url, allow_redirects=True, timeout=timeout)
        return 'Workable' if resp.status_code == 200 else f'Not Workable ({resp.status_code})'
    except requests.RequestException:
        return 'Not Workable'
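# HEAD keeps the check cheap; some servers reject it with 405 Method Not Allowed,
# so we fall back to a full GET before judging the link.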
def detect_keywords_and_score(content, url):
    keywords = []
    score = 0
    imarticus_found = False
    pga_link_found = False
    pga_link = "https://imarticus.org/postgraduate-program-in-data-science-analytics/"
    if content and re.search(r'imarticus', content, re.IGNORECASE):
        keywords.append('Imarticus')
        imarticus_found = True
    if (content and pga_link in content) or pga_link in url:
        pga_link_found = True
    if content and re.search(r'post graduate', content, re.IGNORECASE):
        keywords.append('post graduate')
    if imarticus_found:
        score = 5 if pga_link_found else 3
        return keywords, score
    return [], 0
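# Scoring rubric: 5 = "imarticus" mention plus the PGA program link,
# 3 = "imarticus" mention alone, 0 = no mention at all.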
def detect_code_snippet(content):
    if not content:
        return False
    code_markers = [
        r'```', r'<pre>', r'<code>',  # '<pre>'/'<code>' assumed as the intended HTML code markers
        r'\n    ', r'\t',
        r'def ', r'class ', r'\{', r'\}', r';', r'\(', r'\)', r'import ', r'from ', r'print\('
    ]
    for marker in code_markers:
        if re.search(marker, content):
            return True
    return False
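# Note: these markers are heuristics; ordinary prose containing parentheses or
# semicolons will also trigger a match, so expect some false positives.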
# -------- Originality Check -------- #
def extract_blog_text(url):
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(response.text, 'html.parser')
    paragraphs = soup.find_all('p')
    return ' '.join(p.get_text() for p in paragraphs)
def get_ai_generated_score(url, classifier=classifier):
    text = extract_blog_text(url)
    # Rough character cap so very long posts stay within the model's input limit
    text = text[:3000]
    labels = ["Human-written", "AI-generated"]
    result = classifier(text, candidate_labels=labels)
    scores = dict(zip(result['labels'], result['scores']))
    return scores.get("AI-generated", 0.0)
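# Caveat: bart-large-mnli is a general-purpose zero-shot NLI model, not a trained
# AI-text detector, so treat this score as a rough signal rather than a verdict.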
# -------- Main Summary Extraction -------- #
def extract_summary(file):
    df = pd.read_excel(file)
    total_blogs = len(df)
    imarticus_count = 0
    code_snippet_count = 0
    filtered_rows = []
    full_analysis = []
    for _, row in df.iterrows():
        url = row.get("Blog Link(Medium link)") or row.get("URL") or row.get("url")
        if pd.isna(url):
            continue
        status = check_url_status(url)
        name = row.get("Participant") or row.get("Name")
        if not name or pd.isna(name):
            continue
        centre = row.get("Centre") or row.get("Center")
        if not centre or pd.isna(centre):
            continue
        # originality = get_ai_generated_score(url)
        try:
            article = Article(url)
            article.download()
            article.parse()
            title = article.title
            content = article.text
            if len(content.strip()) == 0:
                continue
            summary = summarize_with_sumy_auto(content)
            keywords, score = detect_keywords_and_score(content, url)
            code_snippet = detect_code_snippet(content)
            if score > 0:
                imarticus_count += 1
            if code_snippet:
                code_snippet_count += 1
            filtered_rows.append({
                "Participant": name,
                "Centre": centre,
                "URL": url,
                "Status": status,
                "Title": title,
                "Content": content,
                "Summary": summary,
                "Identified_Keywords": ', '.join(keywords) if keywords else "None",
                "Code_Snippet": code_snippet,
                "Score": score
                # "Originality(AI-Score)": originality
            })
            full_analysis.append({
                "Participant": name,
                "Centre": centre,
                "URL": url,
                "Title": title,
                "Identified_Keywords": ', '.join(keywords) if keywords else "None",
                "Code_Snippet": code_snippet,
                "Score": score,
                "Summary": summary,
                "Status": status
                # "Originality(AI-Score)": originality
            })
        except Exception as e:
            print(f"Error processing {url}: {e}")
            continue
    filtered_df = pd.DataFrame(filtered_rows)
    full_df = pd.DataFrame(full_analysis)
    return (
        str(total_blogs),
        str(code_snippet_count),
        str(imarticus_count),
        filtered_df,
        full_df
    )
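# The uploaded sheet is expected to provide a link column ("Blog Link(Medium link)",
# "URL", or "url") plus "Participant"/"Name" and "Centre"/"Center" columns;
# rows missing any of these are skipped.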
def filter_analysis(full_df, status_filter, score_filter):
    df = full_df.copy()
    if status_filter != "All":
        # str.contains would match "Workable" inside "Not Workable", so match the prefix instead
        df = df[df["Status"].str.startswith(status_filter)]
    if score_filter != "All":
        df = df[df["Score"] == int(score_filter)]
    df = df[["Title", "Identified_Keywords", "Code_Snippet", "Score", "Summary"]]
    return df
def download_file(full_df):
    if full_df is None or full_df.empty:
        print("No data to download.")
        return None
    output_dir = "./output"
    os.makedirs(output_dir, exist_ok=True)
    file_path = os.path.join(output_dir, "Full_Analysis.xlsx")
    try:
        full_df.to_excel(file_path, index=False)
    except Exception as e:
        print(f"Error saving file: {e}")
        return None
    return file_path
def trigger_download(full_df):
    path = download_file(full_df)
    return path, (gr.update(visible=True) if path else gr.update(visible=False))
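# Returns the saved file path together with a visibility update for the gr.File
# component, so the download widget only appears when there is something to fetch.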
# -------- Gradio UI -------- #
with gr.Blocks(css="""
    .sidebar { background-color: #00664d; color: white; padding: 20px; height: 100%; border-radius: 10px; }
    .sidebar label, .sidebar h2, .sidebar h3, .sidebar span, .sidebar p { color: black !important; }
    .main-content { padding: 20px; background-color: #ffffff; border-radius: 10px; }
    h1, h3 { color: #00664d; }
    @media (min-width: 1024px) {
        .gr-block.gr-box { max-width: 1000px; margin: auto; }
    }
""") as demo:
    with gr.Row():
        with gr.Column(scale=1, elem_classes="sidebar"):
            gr.Markdown("## 📅 Upload & Filter", elem_id="sidebar-title")
            file_input = gr.File(label="Upload Excel File (.xlsx)", file_types=[".xlsx"])
            analyze_btn = gr.Button("Run Summary")
            gr.Markdown("## 🔎 Filter")
            status_filter = gr.Dropdown(["All", "Workable", "Not Workable"], label="Status", value="All")
            score_filter = gr.Dropdown(["All", "0", "3", "5"], label="Score", value="All")
            download_btn = gr.Button("Download Full Analysis")
            download_file_output = gr.File(label="")
        with gr.Column(scale=3, elem_classes="main-content"):
            gr.Markdown("