Spaces:

Deevyankar
/

Handouts

Sleeping

App Files Files Community

Handouts / app4.py

Deevyankar

Rename app.py to app4.py

d47510d verified 3 months ago

raw

history blame contribute delete

4.1 kB


	import gradio as gr
	from PyPDF2 import PdfReader
	import io
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.metrics.pairwise import cosine_similarity
	import matplotlib.pyplot as plt
	import pandas as pd
	import numpy as np
	import re


	def extract_text_from_pdf(pdf_file):
	    """Extract the text of every page of a PDF supplied as raw bytes.

	    Args:
	        pdf_file: The PDF content as a bytes object; it is wrapped in an
	            in-memory stream before being handed to ``PdfReader``.

	    Returns:
	        The text of all pages joined with newlines and stripped of
	        surrounding whitespace, or ``""`` when reading fails or no page
	        yields any text.
	    """
	    try:
	        reader = PdfReader(io.BytesIO(pdf_file))
	        page_texts = [page.extract_text() for page in reader.pages]
	        # Skip pages for which no text came back, and separate pages with a
	        # newline so the last word of one page is not fused with the first
	        # word of the next (which would distort word counts and TF-IDF).
	        return "\n".join(text for text in page_texts if text).strip()
	    except Exception as e:
	        # Best-effort by design: any failure is logged and reported to the
	        # caller as "no text" rather than raised.
	        print("PDF extraction error:", e)
	        return ""


	def semantic_match(lo_list, content):
	    """Score how closely each learning outcome matches a body of text.

	    A TF-IDF model is fitted on ``content`` together with the non-blank
	    learning outcomes, and each outcome is compared to ``content`` by
	    cosine similarity.

	    Args:
	        lo_list: Learning-outcome strings. Blank or whitespace-only entries
	            are ignored, so the result can be shorter than ``lo_list``.
	        content: The text to compare every learning outcome against.

	    Returns:
	        A list of float similarity scores, one per non-blank learning
	        outcome, in input order. Empty when there are no non-blank
	        outcomes.
	    """
	    lo_texts = [lo for lo in lo_list if lo.strip()]
	    if not lo_texts:
	        # Nothing to score; also avoids handing a zero-row matrix to
	        # cosine_similarity below.
	        return []

	    # Row 0 is the content, rows 1..n are the learning outcomes.
	    tfidf_matrix = TfidfVectorizer().fit_transform([content] + lo_texts)

	    # One call on the sparse matrix instead of densifying it and calling
	    # cosine_similarity once per outcome.
	    similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])
	    return similarities[0].tolist()


	def compare_all(old_pdf, new_pdf, lo_file):
	    """Compare two handout PDFs and score both against learning outcomes.

	    Args:
	        old_pdf: Raw bytes of the old handout PDF.
	        new_pdf: Raw bytes of the new handout PDF.
	        lo_file: Raw bytes of a text file holding one learning outcome per
	            line; blank lines are ignored.

	    Returns:
	        A 4-tuple ``(summary, table, figure, status)``. On success these are
	        the summary text, a pandas DataFrame of per-outcome scores, a
	        matplotlib Figure and a status message. On failure the first item
	        is an error message and the other three are ``None``.
	    """
	    try:
	        lo_lines = lo_file.decode("utf-8", errors="ignore").splitlines()
	    except (AttributeError, TypeError):
	        # lo_file is None (nothing uploaded) or is not a bytes object.
	        return "❌ Could not read learning outcomes file.", None, None, None

	    los = [line.strip() for line in lo_lines if line.strip()]
	    if not los:
	        # Without outcomes the table and chart would be empty and the
	        # summary meaningless, so report it instead.
	        return "❌ No learning outcomes found in the file.", None, None, None

	    old_text = extract_text_from_pdf(old_pdf)
	    new_text = extract_text_from_pdf(new_pdf)

	    if not old_text or not new_text:
	        return "❌ Could not extract text from one or both PDFs.", None, None, None

	    old_scores = semantic_match(los, old_text)
	    new_scores = semantic_match(los, new_text)

	    labels = [f"LO{i}" for i in range(1, len(los) + 1)]
	    x = np.arange(len(labels))

	    # Plot: grouped bars, old on the left and new on the right of each tick.
	    fig, ax = plt.subplots()
	    ax.bar(x - 0.2, old_scores, width=0.4, label="Old", align='center')
	    ax.bar(x + 0.2, new_scores, width=0.4, label="New", align='center')
	    ax.set_xticks(x)
	    ax.set_xticklabels(labels, rotation=45)
	    ax.set_ylabel("Semantic Match Score")
	    ax.set_title("Learning Outcomes Comparison")
	    ax.legend()
	    # Unregister the figure from pyplot so repeated requests do not pile up
	    # open figures; the Figure object itself is still returned below.
	    plt.close(fig)

	    # Table
	    data = {
	        "Learning Outcome": labels,
	        "Old Match (%)": [round(s * 100, 2) for s in old_scores],
	        "New Match (%)": [round(s * 100, 2) for s in new_scores],
	        "Change (%)": [
	            round((new - old) * 100, 2)
	            for new, old in zip(new_scores, old_scores)
	        ],
	    }
	    df = pd.DataFrame(data)

	    # Content similarity between the two handouts as a whole.
	    tfidf = TfidfVectorizer().fit_transform([old_text, new_text])
	    cosine_sim = cosine_similarity(tfidf[0:1], tfidf[1:2])[0][0] * 100
	    content_diff = 100 - round(cosine_sim, 2)

	    # Text size change, measured in words.
	    len_old = len(re.findall(r'\w+', old_text))
	    len_new = len(re.findall(r'\w+', new_text))
	    if len_old:
	        word_change_percent = round(((len_new - len_old) / len_old) * 100, 2)
	        sign = '+' if word_change_percent >= 0 else ''
	        length_change = f"{sign}{word_change_percent:.2f}%"
	    else:
	        # Text made only of punctuation/symbols has no words; a relative
	        # change is undefined, so avoid dividing by zero.
	        length_change = "n/a (old handout contains no words)"

	    matched = sum(1 for s in new_scores if s >= 0.5)
	    old_total = sum(old_scores)
	    new_total = sum(new_scores)
	    if new_total > old_total:
	        alignment = 'more'
	    elif new_total < old_total:
	        alignment = 'less'
	    else:
	        # e.g. the same document uploaded twice.
	        alignment = 'equally'

	    # Built line by line so the text does not depend on source indentation.
	    summary = "\n".join([
	        "📘 Summary of Comparison",
	        "",
	        f"📈 Overall Content Change: {content_diff:.2f}%",
	        "🔍 This is based on TF-IDF cosine similarity between old and new handouts.",
	        "",
	        f"📝 Text Length Difference: {length_change}",
	        "Compared by total number of words in both handouts.",
	        "",
	        f"🎯 Learning Outcome Matches: {matched} of {len(los)}",
	        f"✅ New handout appears {alignment} aligned with stated outcomes.",
	    ])

	    return summary, df, fig, "✅ Comparison completed successfully."


	# Gradio UI: three binary file uploads feed compare_all, whose four return
	# values map, in order, onto the summary box, table, plot and status box.
	iface = gr.Interface(
	    fn=compare_all,
	    inputs=[
	        gr.File(label="Old Handout PDF", type='binary'),
	        gr.File(label="New Handout PDF", type='binary'),
	        gr.File(label="Learning Outcomes (Text File)", type='binary'),
	    ],
	    outputs=[
	        gr.Textbox(label="📘 Summary & Insights", lines=20, max_lines=25),
	        gr.Dataframe(label="📊 LO-wise Comparison Table"),
	        gr.Plot(label="📈 LO Visual Comparison"),
	        gr.Textbox(label="ℹ️ Status", lines=1),
	    ],
	    title="📘 Handout Comparator + LO Analyzer",
	    description="Upload OLD and NEW handouts in PDF format along with a TXT file of Learning Outcomes. The app compares content changes and evaluates alignment with LOs visually and in table format.",
	)

	# Start the server only when this file is run as a script, so importing the
	# module (for reuse or testing) does not launch the app as a side effect.
	if __name__ == "__main__":
	    iface.launch()