Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
|
|
|
|
| 2 |
import gradio as gr
|
| 3 |
from PyPDF2 import PdfReader
|
| 4 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
@@ -8,71 +9,70 @@ import matplotlib.pyplot as plt
|
|
| 8 |
import pandas as pd
|
| 9 |
import io
|
| 10 |
|
| 11 |
-
# Load
|
| 12 |
-
model = SentenceTransformer('
|
| 13 |
|
| 14 |
-
def extract_text_from_pdf(
|
| 15 |
try:
|
| 16 |
-
reader = PdfReader(io.BytesIO(
|
| 17 |
-
|
| 18 |
-
for page in reader.pages:
|
| 19 |
-
text = page.extract_text()
|
| 20 |
-
if text:
|
| 21 |
-
full_text += text
|
| 22 |
-
return full_text
|
| 23 |
except Exception as e:
|
|
|
|
| 24 |
return ""
|
| 25 |
|
| 26 |
-
def
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
vectors = vectorizer.toarray()
|
| 30 |
-
content_vec = vectors[0]
|
| 31 |
-
scores = [cosine_similarity([content_vec], [vec])[0][0] for vec in vectors[1:]]
|
| 32 |
-
return scores
|
| 33 |
|
| 34 |
def transformer_similarity(text1, text2):
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
"
|
| 42 |
-
"
|
| 43 |
-
"
|
| 44 |
-
"
|
| 45 |
-
"
|
| 46 |
-
"
|
| 47 |
}
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
|
| 56 |
def compare_all(old_pdf, new_pdf, lo_file):
|
| 57 |
try:
|
| 58 |
-
|
| 59 |
-
los = [
|
| 60 |
-
except:
|
| 61 |
return "β Could not read learning outcomes file.", None, None, None
|
| 62 |
|
| 63 |
old_text = extract_text_from_pdf(old_pdf)
|
| 64 |
new_text = extract_text_from_pdf(new_pdf)
|
| 65 |
|
| 66 |
-
if not old_text
|
| 67 |
return "β Could not extract text from one or both PDFs.", None, None, None
|
| 68 |
|
| 69 |
-
|
| 70 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
|
| 72 |
-
# Bar Plot
|
| 73 |
labels = [f"LO{i+1}" for i in range(len(los))]
|
| 74 |
x = range(len(labels))
|
| 75 |
-
fig, ax = plt.subplots()
|
| 76 |
ax.bar(x, old_scores, width=0.4, label="Old", align='center')
|
| 77 |
ax.bar([i + 0.4 for i in x], new_scores, width=0.4, label="New", align='center')
|
| 78 |
ax.set_xticks([i + 0.2 for i in x])
|
|
@@ -81,63 +81,45 @@ def compare_all(old_pdf, new_pdf, lo_file):
|
|
| 81 |
ax.set_title("Learning Outcomes Comparison")
|
| 82 |
ax.legend()
|
| 83 |
|
| 84 |
-
# Table
|
| 85 |
data = {
|
| 86 |
"Learning Outcome": labels,
|
|
|
|
|
|
|
| 87 |
"Old Match": [round(s*100, 2) for s in old_scores],
|
| 88 |
"New Match": [round(s*100, 2) for s in new_scores],
|
| 89 |
-
"Change (%)": [round((
|
| 90 |
}
|
| 91 |
df = pd.DataFrame(data)
|
| 92 |
|
| 93 |
-
|
| 94 |
-
tfidf_sim = cosine_similarity(
|
| 95 |
-
TfidfVectorizer().fit_transform([old_text, new_text])
|
| 96 |
-
)[0][1]
|
| 97 |
-
tfidf_percent = round(tfidf_sim * 100, 2)
|
| 98 |
-
|
| 99 |
-
# Sentence Transformer Similarity
|
| 100 |
-
try:
|
| 101 |
-
trans_sim = round(transformer_similarity(old_text, new_text) * 100, 2)
|
| 102 |
-
except:
|
| 103 |
-
trans_sim = "N/A"
|
| 104 |
-
|
| 105 |
-
# Text Length Change
|
| 106 |
-
length_change = round(((len(new_text) - len(old_text)) / len(old_text)) * 100, 2)
|
| 107 |
-
|
| 108 |
-
# Bloom's Taxonomy Quality Index
|
| 109 |
-
quality_index = calculate_quality_index(new_text)
|
| 110 |
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
βοΈ **Text Length Change**: {length_change}% ({'more' if length_change > 0 else 'less'} content)
|
| 118 |
-
π **Quality Index (Bloom's Taxonomy)**: {quality_index}%
|
| 119 |
-
|
| 120 |
-
π― **Matched Learning Outcomes**: {sum(1 for s in new_scores if s >= 0.5)} of {len(los)}
|
| 121 |
-
π **Insight**: New content appears {'better' if sum(new_scores) > sum(old_scores) else 'less'} aligned with outcomes.
|
| 122 |
"""
|
| 123 |
|
| 124 |
-
return summary, df, fig,
|
|
|
|
|
|
|
| 125 |
|
| 126 |
iface = gr.Interface(
|
| 127 |
fn=compare_all,
|
| 128 |
inputs=[
|
| 129 |
gr.File(label="Old Handout PDF", type='binary'),
|
| 130 |
gr.File(label="New Handout PDF", type='binary'),
|
| 131 |
-
gr.File(label="Learning Outcomes (TXT)", type='binary')
|
| 132 |
],
|
| 133 |
outputs=[
|
| 134 |
-
gr.Markdown(label="
|
| 135 |
gr.Dataframe(label="π LO-wise Comparison Table"),
|
| 136 |
-
gr.Plot(label="
|
| 137 |
-
gr.
|
| 138 |
],
|
| 139 |
-
title="
|
| 140 |
-
description="Compare two handouts and
|
| 141 |
)
|
| 142 |
|
| 143 |
iface.launch()
|
|
|
|
| 1 |
|
| 2 |
+
|
| 3 |
import gradio as gr
|
| 4 |
from PyPDF2 import PdfReader
|
| 5 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
|
|
| 9 |
import pandas as pd
|
| 10 |
import io
|
| 11 |
|
| 12 |
+
# Load the SentenceTransformer once at module import; reused by
# transformer_similarity() for every semantic comparison in this app.
# NOTE(review): this downloads the model on first run — requires network
# access at startup; confirm the Space has the model cached.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
|
| 14 |
|
| 15 |
+
def extract_text_from_pdf(file_bytes):
    """Extract all page text from a PDF supplied as raw bytes.

    Returns the page texts joined by single spaces and stripped, or ""
    when the PDF cannot be opened or parsed (best-effort behaviour).
    """
    try:
        document = PdfReader(io.BytesIO(file_bytes))
        page_texts = []
        for page in document.pages:
            # extract_text() may return None for image-only pages.
            page_texts.append(page.extract_text() or "")
        return " ".join(page_texts).strip()
    except Exception as err:
        # Best-effort: report the problem and hand back an empty string
        # so the caller can show a user-facing error instead of crashing.
        print("Error extracting text:", err)
        return ""
|
| 22 |
|
| 23 |
+
def tfidf_similarity(text1, text2):
    """Return the cosine similarity between the TF-IDF vectors of two texts."""
    # Fit a fresh vectorizer on exactly these two documents; row 0 is
    # text1's vector, row 1 is text2's.
    tfidf_matrix = TfidfVectorizer().fit_transform([text1, text2])
    first_row = tfidf_matrix[0:1]
    second_row = tfidf_matrix[1:2]
    return cosine_similarity(first_row, second_row)[0][0]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
def transformer_similarity(text1, text2):
    """Return semantic cosine similarity of two texts as a plain float.

    Uses the module-level SentenceTransformer `model` to embed both texts
    in one batch, then compares the two embeddings.
    """
    first_emb, second_emb = model.encode([text1, text2], convert_to_tensor=True)
    similarity = util.pytorch_cos_sim(first_emb, second_emb)
    return similarity.item()
|
| 30 |
+
|
| 31 |
+
def bloom_level(term):
    """Classify a learning-outcome phrase into a Bloom's-taxonomy level.

    Scans the phrase for characteristic action verbs and returns the
    first matching level name ("Remember", "Understand", "Apply",
    "Analyze", "Evaluate", "Create"), or "Unknown" when none is found.

    Matching is prefix-based per word: inflected forms still match
    ("defines" -> "define", "using" -> "use"), but mid-word substrings no
    longer cause false positives — the original raw `word in term` check
    classified e.g. "because" as Apply, since "because" contains "use".
    """
    tokens = term.lower().split()
    blooms = {
        "remember": ["define", "list", "recall", "identify"],
        "understand": ["explain", "describe", "summarize"],
        "apply": ["apply", "demonstrate", "use"],
        "analyze": ["analyze", "compare", "contrast"],
        "evaluate": ["evaluate", "judge", "critique"],
        "create": ["create", "design", "formulate"]
    }
    # dict preserves insertion order, so lower taxonomy levels still win
    # ties exactly as in the original implementation.
    for level, keywords in blooms.items():
        if any(tok.startswith(kw) for tok in tokens for kw in keywords):
            return level.capitalize()
    return "Unknown"
|
| 45 |
+
|
| 46 |
+
def lo_semantic_scores(los, content):
    """Score every learning outcome against `content`.

    Returns one transformer-similarity float per outcome, in the same
    order as `los`.
    """
    return [transformer_similarity(outcome, content) for outcome in los]
|
| 52 |
|
| 53 |
def compare_all(old_pdf, new_pdf, lo_file):
    """Compare two handout PDFs against a list of learning outcomes.

    Parameters (presumably raw bytes from gr.File(type='binary') — TODO
    confirm against the Interface definition):
        old_pdf, new_pdf: PDF file contents.
        lo_file: text file contents; one learning outcome per line.

    Returns a 4-tuple (summary_markdown, dataframe, figure, preview_text),
    or (error_message, None, None, None) on failure.
    """
    try:
        # Accept either a file-like object or raw bytes for the LO file.
        lo_content = lo_file.read().decode("utf-8", errors="ignore") if hasattr(lo_file, "read") else lo_file.decode("utf-8", errors="ignore")
        # One learning outcome per non-blank line.
        los = [line.strip() for line in lo_content.splitlines() if line.strip()]
    except Exception as e:
        # NOTE(review): broad except and `e` is unused — consider logging it.
        return "β Could not read learning outcomes file.", None, None, None

    old_text = extract_text_from_pdf(old_pdf)
    new_text = extract_text_from_pdf(new_pdf)

    # extract_text_from_pdf returns "" on failure, so this guard also
    # ensures len(old_text) > 0 before the division below.
    if not old_text or not new_text:
        return "β Could not extract text from one or both PDFs.", None, None, None

    # Whole-document similarity metrics (lexical and semantic).
    tfidf_sim = tfidf_similarity(old_text, new_text)
    transformer_sim = transformer_similarity(old_text, new_text)
    # Percent change in character count relative to the old handout.
    text_growth = round(((len(new_text) - len(old_text)) / len(old_text)) * 100, 2)

    # Per-outcome semantic alignment for each version.
    old_scores = lo_semantic_scores(los, old_text)
    new_scores = lo_semantic_scores(los, new_text)

    # Grouped bar chart: old vs. new alignment per learning outcome.
    labels = [f"LO{i+1}" for i in range(len(los))]
    x = range(len(labels))
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.bar(x, old_scores, width=0.4, label="Old", align='center')
    ax.bar([i + 0.4 for i in x], new_scores, width=0.4, label="New", align='center')
    ax.set_xticks([i + 0.2 for i in x])
    # NOTE(review): two axis-setup lines (file lines 79-80, likely tick
    # labels / y-label) are hidden by the diff hunk boundary — confirm
    # against the full file before relying on this reconstruction.
    ax.set_title("Learning Outcomes Comparison")
    ax.legend()

    # Tabular breakdown shown in the gr.Dataframe output.
    data = {
        "Learning Outcome": labels,
        "LO Text": los,
        "Bloom Level": [bloom_level(lo) for lo in los],
        "Old Match": [round(s*100, 2) for s in old_scores],
        "New Match": [round(s*100, 2) for s in new_scores],
        # Scores are 0-1 similarities, so *100 converts to percent points.
        "Change (%)": [round((n - o)*100, 2) for n, o in zip(new_scores, old_scores)]
    }
    df = pd.DataFrame(data)

    # Markdown summary for the gr.Markdown output. The leading "π"/"β"
    # characters appear to be mojibake of emoji — preserved as-is.
    summary = f"""π **Summary of Comparison**

π **TF-IDF Content Change**: {round((1 - tfidf_sim) * 100, 2)}%
π§ **Transformer-based Similarity**: {round(transformer_sim * 100, 2)}%
π **Content Length Change**: {text_growth}% {"π Reduced" if text_growth < 0 else "π Increased"}

π― **LO Matches**: {sum(1 for score in new_scores if score > 0.5)} of {len(los)}
π **Content appears {'more' if sum(new_scores) > sum(old_scores) else 'less'} aligned with learning outcomes.**
"""

    # Preview is truncated to the first 2000 characters of the new handout.
    return summary, df, fig, new_text[:2000] + "..."
|
| 105 |
+
|
| 106 |
+
# Gradio UI wiring: three file inputs -> summary, table, chart, preview.
# (The redundant mid-file `import gradio as gr` was removed — gradio is
# already imported at the top of the file.)
iface = gr.Interface(
    fn=compare_all,
    inputs=[
        gr.File(label="Old Handout PDF", type='binary'),
        gr.File(label="New Handout PDF", type='binary'),
        gr.File(label="Learning Outcomes (TXT)", type='binary')
    ],
    outputs=[
        gr.Markdown(label="π Summary"),
        gr.Dataframe(label="π LO-wise Comparison Table"),
        gr.Plot(label="π LO Match Chart"),
        gr.Textbox(label="π Preview of New Content")
    ],
    title="π AI Handout Comparator + LO Aligner",
    description="Compare two versions of handouts using both TF-IDF and Transformers. Analyze changes in content, alignment with Learning Outcomes, and Bloomβs taxonomy level."
)

iface.launch()
|