Deevyankar committed on
Commit
bf6da39
·
verified ·
1 Parent(s): 6524b79

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +100 -0
app.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import gradio as gr
3
+ from PyPDF2 import PdfReader
4
+ from sklearn.feature_extraction.text import TfidfVectorizer
5
+ from sklearn.metrics.pairwise import cosine_similarity
6
+ from sentence_transformers import SentenceTransformer, util
7
+ import matplotlib.pyplot as plt
8
+ import pandas as pd
9
+ import io
10
+
11
# Sentence-embedding model instantiated once at module level so that
# transformer_similarity() can reuse the same instance on every call.
model = SentenceTransformer("all-MiniLM-L6-v2")
13
+
14
def extract_text_from_pdf(pdf_file):
    """Extract the text of every page of a PDF supplied as raw bytes.

    Args:
        pdf_file: The PDF contents as a bytes-like object.

    Returns:
        The text of all pages that yielded any text, joined with newlines
        and stripped of surrounding whitespace. An empty string is returned
        when the PDF cannot be read or no page yields text.
    """
    import logging  # local import so this block does not depend on other edits

    try:
        reader = PdfReader(io.BytesIO(pdf_file))
        page_texts = (page.extract_text() for page in reader.pages)
        # Join with a newline so the last word of one page is not glued to
        # the first word of the next (which would create a bogus token).
        return "\n".join(text for text in page_texts if text).strip()
    except Exception:
        # Deliberately best-effort: callers treat "" as "could not extract".
        # Log the traceback so the underlying cause is not lost.
        logging.getLogger(__name__).exception("Failed to extract text from PDF")
        return ""
25
+
26
def tfidf_similarity(text1, text2):
    """Return the cosine similarity of the TF-IDF vectors of two texts.

    Args:
        text1: First document as a string.
        text2: Second document as a string.

    Returns:
        The cosine similarity as a float, or 0.0 when the vectorizer cannot
        build a vocabulary from the two texts.
    """
    vectorizer = TfidfVectorizer()
    try:
        tfidf = vectorizer.fit_transform([text1, text2])
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary") when neither
        # text contains a token; there is nothing to compare in that case.
        return 0.0
    # Row 0 is text1 and row 1 is text2; slices keep them 2-D as required
    # by cosine_similarity.
    return float(cosine_similarity(tfidf[0:1], tfidf[1:2])[0][0])
30
+
31
def transformer_similarity(text1, text2):
    """Return the cosine similarity between the embeddings of two texts.

    Both texts are encoded with the module-level ``model`` (text1 first,
    then text2) and the single similarity value is returned as a Python
    number.
    """
    first_vec, second_vec = (
        model.encode(text, convert_to_tensor=True) for text in (text1, text2)
    )
    similarity = util.pytorch_cos_sim(first_vec, second_vec)
    return similarity.item()
35
+
36
def _parse_learning_outcomes(lo_file):
    """Decode uploaded bytes into a list of non-blank, stripped lines."""
    lines = lo_file.decode("utf-8", errors="ignore").splitlines()
    return [line.strip() for line in lines if line.strip()]


def _build_lo_chart(labels, lo_scores):
    """Draw the per-learning-outcome bar chart and return its figure."""
    # Build the Figure directly instead of through pyplot: pyplot keeps every
    # figure it creates in a global registry until it is closed, which leaks
    # one figure per request in a long-running server.
    from matplotlib.figure import Figure

    fig = Figure(figsize=(8, 4))
    ax = fig.subplots()
    ax.bar(labels, lo_scores, color="skyblue")
    ax.set_ylim(0, 1)
    ax.set_ylabel("Semantic Match")
    ax.set_title("LO-wise Transformer Similarity")
    # Rotate via the Axes rather than plt.xticks(), which acts on whatever
    # pyplot currently considers the "current" figure.
    ax.tick_params(axis="x", labelrotation=45)
    return fig


def compare_all(old_pdf, new_pdf, lo_file):
    """Compare two handout PDFs and score the new one against learning outcomes.

    Args:
        old_pdf: Raw bytes of the old handout PDF.
        new_pdf: Raw bytes of the new handout PDF.
        lo_file: Raw bytes of a text file with one learning outcome per line.

    Returns:
        A 6-tuple ``(insights, df, fig, new_text, tfidf_sim, transformer_sim)``:
        a text summary, a DataFrame of per-outcome scores, a bar-chart figure,
        the extracted text of the new PDF, and the two document-level
        similarity scores. On failure the first element is an error message
        and the remaining five are ``None``.
    """
    try:
        los = _parse_learning_outcomes(lo_file)
    except (AttributeError, TypeError, UnicodeError):
        # Covers a missing upload (None) or a value that is not bytes-like.
        return "❌ Could not read learning outcomes file.", None, None, None, None, None

    old_text = extract_text_from_pdf(old_pdf)
    new_text = extract_text_from_pdf(new_pdf)
    if not old_text or not new_text:
        return "❌ Could not extract text from one or both PDFs.", None, None, None, None, None

    tfidf_sim = tfidf_similarity(old_text, new_text)
    transformer_sim = transformer_similarity(old_text, new_text)
    # Relative change in character count, as a percentage of the old length.
    content_diff = abs(len(new_text) - len(old_text)) / max(len(old_text), 1) * 100

    insights = (
        f"🔍 **TF-IDF Similarity:** {round(tfidf_sim * 100, 2)}%\n"
        f"🤖 **Transformer Similarity:** {round(transformer_sim * 100, 2)}%\n"
        f"📄 **Text Length Difference:** {round(content_diff, 2)}%\n"
    )

    # Score each learning outcome against the full text of the new handout.
    lo_scores = [transformer_similarity(lo, new_text) for lo in los]
    labels = [f"LO{number}" for number in range(1, len(los) + 1)]

    df = pd.DataFrame({
        "Learning Outcome": labels,
        "Match Score (0-1)": [round(score, 2) for score in lo_scores],
    })
    fig = _build_lo_chart(labels, lo_scores)

    return insights, df, fig, new_text, tfidf_sim, transformer_sim
80
+
81
# Gradio UI: three binary file uploads feed compare_all(), whose six return
# values map one-to-one, in order, onto the output components below.
# NOTE(review): the description mentions "Bloom's mapping", which nothing in
# the visible code computes — confirm or reword.
iface = gr.Interface(
    fn=compare_all,
    inputs=[
        gr.File(label="Old Handout PDF", type="binary"),
        gr.File(label="New Handout PDF", type="binary"),
        gr.File(label="Learning Outcomes (TXT)", type="binary"),
    ],
    outputs=[
        gr.Textbox(label="📘 Summary of Analysis"),
        gr.Dataframe(label="📊 LO-wise Semantic Comparison"),
        gr.Plot(label="📈 LO Match Chart"),
        gr.Textbox(label="📖 New Handout Preview (Full Text)", lines=10, max_lines=20),
        gr.Number(label="TF-IDF Similarity Score"),
        gr.Number(label="Transformer Similarity Score"),
    ],
    title="📘 Course Handout Comparison Tool",
    description="Compare old and new handouts, analyze semantic change, LO alignment, and visualize Bloom's mapping.",
)

# Start the server only when run as a script, so importing this module
# (e.g. for testing) does not launch a web server as a side effect.
if __name__ == "__main__":
    iface.launch()