Deevyankar committed on
Commit
17240fa
·
verified ·
1 Parent(s): 107339c

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +130 -0
app.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import gradio as gr
3
+ from PyPDF2 import PdfReader
4
+ from sklearn.feature_extraction.text import TfidfVectorizer
5
+ from sklearn.metrics.pairwise import cosine_similarity
6
+ from transformers import pipeline
7
+ import matplotlib.pyplot as plt
8
+ import pandas as pd
9
+ import io
10
+
11
# Shared Hugging Face feature-extraction pipeline. It is built once at import
# time and reused by transformer_similarity on every request.
semantic_pipeline = pipeline(
    "feature-extraction",
    model="sentence-transformers/all-MiniLM-L6-v2",
)
12
+
13
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF, joined by newlines.

    Args:
        pdf_file: Raw bytes of the PDF document (what ``gr.File`` delivers
            with ``type='binary'``). ``None`` or empty bytes are accepted.

    Returns:
        The stripped text of all pages that yielded text, one page per
        newline-separated chunk. An empty string is returned when no file
        was supplied, when the PDF cannot be parsed, or when no page
        contains extractable text.
    """
    # Nothing uploaded: skip the parser instead of logging a spurious error.
    if not pdf_file:
        return ""

    try:
        reader = PdfReader(io.BytesIO(pdf_file))
        page_texts = [page.extract_text() for page in reader.pages]
    except Exception as e:
        # Best-effort boundary: any parser failure is reported to the console
        # and surfaced to the caller as "no text".
        print("Error reading PDF:", e)
        return ""

    # Join with a newline so the last word of one page does not fuse with the
    # first word of the next page (which would corrupt word-based scoring).
    return "\n".join(text for text in page_texts if text).strip()
25
+
26
def tfidf_similarity(text1, text2):
    """Return the cosine similarity between the TF-IDF vectors of two texts.

    A fresh ``TfidfVectorizer`` is fitted on just these two texts, so the
    vocabulary and weights depend only on this pair.
    """
    tfidf_matrix = TfidfVectorizer().fit_transform([text1, text2])
    first_row = tfidf_matrix[0:1]
    second_row = tfidf_matrix[1:2]
    pairwise = cosine_similarity(first_row, second_row)
    # cosine_similarity returns a 1x1 matrix here; unwrap the single score.
    return pairwise[0][0]
29
+
30
def transformer_similarity(text1, text2):
    """Return the cosine similarity of the two texts' averaged embeddings.

    Only the first 512 characters of each text are passed to
    ``semantic_pipeline``; the rest of the document is not considered.
    """

    def averaged_embedding(text):
        # First element of the pipeline output is a sequence of vectors
        # (presumably one per token); average them position by position.
        vectors = semantic_pipeline(text[:512])[0]
        return [sum(column) / len(column) for column in zip(*vectors)]

    first = averaged_embedding(text1)
    second = averaged_embedding(text2)
    return cosine_similarity([first], [second])[0][0]
36
+
37
def semantic_match(lo_list, content):
    """Score how closely each learning outcome matches a document.

    A single TF-IDF model is fitted on the document plus all outcomes, and
    each outcome is compared to the document by cosine similarity.

    Args:
        lo_list: Learning-outcome strings.
        content: Full text of the document to compare against.

    Returns:
        A list of floats, one per entry of ``lo_list`` and in the same order.
        Empty when ``lo_list`` is empty.
    """
    outcomes = list(lo_list)
    if not outcomes:
        return []

    # Row 0 is the document; rows 1..n are the outcomes.
    tfidf_matrix = TfidfVectorizer().fit_transform([content] + outcomes)

    # One sparse call scores every outcome at once, instead of densifying the
    # matrix and invoking cosine_similarity separately for each row.
    similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])
    return similarities[0].tolist()
43
+
44
def _error_result(message):
    """Return *message* padded with ``None`` to fill the four UI outputs."""
    return message, None, None, None


def _plot_lo_scores(labels, old_scores, new_scores):
    """Draw a grouped bar chart of old vs. new scores, one pair per outcome."""
    positions = range(len(labels))
    fig, ax = plt.subplots()
    ax.bar(positions, old_scores, width=0.4, label="Old", align='center')
    ax.bar([i + 0.4 for i in positions], new_scores, width=0.4, label="New", align='center')
    # Put each tick midway between the "Old" and "New" bar of its pair.
    ax.set_xticks([i + 0.2 for i in positions])
    ax.set_xticklabels(labels, rotation=45)
    ax.set_ylabel("Semantic Match Score")
    ax.set_title("Learning Outcomes Comparison")
    ax.legend()
    # pyplot keeps every figure it creates alive until it is closed; release
    # it so a long-running server does not accumulate one figure per request.
    # The Figure object itself stays usable for rendering by the caller.
    plt.close(fig)
    return fig


def _build_report(tfidf_sim, transformer_sim, text_growth, old_scores, new_scores):
    """Assemble the Markdown summary and the explanation of the methods."""
    growth_sign = '+' if text_growth >= 0 else ''
    matched = sum(1 for s in new_scores if s >= 0.5)
    trend = 'more' if sum(new_scores) > sum(old_scores) else 'less'

    summary = (
        "📘 **Summary of Comparison**\n\n"
        f"📈 **TF-IDF Content Similarity**: {round(tfidf_sim * 100, 2)}%\n"
        f"🤖 **Transformer-based Similarity**: {round(transformer_sim * 100, 2)}%\n"
        f"📏 **Text Growth**: {growth_sign}{round(text_growth, 2)}% more content in new handout\n\n"
        f"🎯 **LOs Matched (New ≥ 0.5)**: {matched} of {len(new_scores)}\n"
        f"📊 **Insight**: New content appears {trend} aligned with outcomes.\n"
    )
    explanation = (
        "\n\n---\n\n"
        "📚 **Explanation of Methods**:\n"
        "- **TF-IDF Similarity** checks how often important words appear in both documents. It gives a quick idea of textual overlap.\n"
        "- **Transformer Similarity** uses AI to understand meaning beyond words. It compares the 'sense' of the documents like a human would.\n"
    )
    return summary + explanation


def compare_all(old_pdf, new_pdf, lo_file):
    """Compare two handout PDFs against a list of learning outcomes.

    Args:
        old_pdf: Raw bytes of the old handout PDF.
        new_pdf: Raw bytes of the new handout PDF.
        lo_file: Raw bytes of a UTF-8 text file with one learning outcome per
            line; blank lines are ignored.

    Returns:
        A 4-tuple ``(markdown_report, dataframe, figure, new_text)``. On any
        input problem the first element is an error message and the other
        three are ``None``, so the tuple always has one value per output
        component of the interface.
    """
    try:
        raw_lines = lo_file.decode("utf-8", errors="ignore").splitlines()
    except (AttributeError, TypeError):
        # No file uploaded (None) or the upload did not arrive as bytes.
        return _error_result("❌ Could not read learning outcomes file.")

    los = [line.strip() for line in raw_lines if line.strip()]
    if not los:
        # Without outcomes there is nothing to score or plot.
        return _error_result("❌ No learning outcomes found in the file.")

    old_text = extract_text_from_pdf(old_pdf)
    new_text = extract_text_from_pdf(new_pdf)
    if not old_text or not new_text:
        return _error_result("❌ Could not extract text from one or both PDFs.")

    old_scores = semantic_match(los, old_text)
    new_scores = semantic_match(los, new_text)

    labels = [f"LO{i+1}" for i in range(len(los))]
    fig = _plot_lo_scores(labels, old_scores, new_scores)

    df = pd.DataFrame({
        "Learning Outcome": labels,
        "Old Match (%)": [round(s*100, 2) for s in old_scores],
        "New Match (%)": [round(s*100, 2) for s in new_scores],
        "Change (%)": [round((n - o)*100, 2) for o, n in zip(old_scores, new_scores)],
    })

    tfidf_sim = tfidf_similarity(old_text, new_text)
    transformer_sim = transformer_similarity(old_text, new_text)
    # old_text is non-empty here (checked above), so the division is safe.
    text_growth = (len(new_text) - len(old_text)) / len(old_text) * 100

    report = _build_report(tfidf_sim, transformer_sim, text_growth, old_scores, new_scores)
    return report, df, fig, new_text
112
+
113
# Gradio UI: three binary file uploads are passed to compare_all, which must
# return one value per output component, in the order listed below.
iface = gr.Interface(
    fn=compare_all,
    inputs=[
        gr.File(label="Old Handout PDF", type='binary'),
        gr.File(label="New Handout PDF", type='binary'),
        gr.File(label="Learning Outcomes (Text File)", type='binary'),
    ],
    outputs=[
        gr.Markdown(label="📘 Summary & Insights"),
        gr.Dataframe(label="📊 LO-wise Comparison Table"),
        gr.Plot(label="📈 Visual LO Change Chart"),
        gr.Textbox(label="📄 New Handout Preview (Full Text)", lines=20, interactive=False),
    ],
    title="📘 Handout Comparison + LO Semantic Analysis",
    description="Upload two handouts (old and new) and a text file of Learning Outcomes (LOs). This tool compares content using TF-IDF and Transformers, visualizes LO changes, and explains results in simple terms.",
)

# Start the web server when the script is executed.
iface.launch()