Deevyankar committed on
Commit
5abbe67
·
verified ·
1 Parent(s): d83eb1d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +42 -12
app.py CHANGED
@@ -3,8 +3,10 @@ import gradio as gr
3
  import fitz # PyMuPDF
4
  from docx import Document
5
  import io
6
- import difflib
7
  import re
 
 
 
8
  from sklearn.feature_extraction.text import TfidfVectorizer
9
  from sklearn.metrics.pairwise import cosine_similarity
10
 
@@ -49,13 +51,15 @@ def quality_check(new_text):
49
  return "πŸ”΄ New content may need more detail."
50
 
51
  def find_relevant_los(content, los):
 
 
52
  vectorizer = TfidfVectorizer().fit_transform([content] + los)
53
  similarities = cosine_similarity(vectorizer[0:1], vectorizer[1:]).flatten()
54
  matched = []
55
  for i, score in enumerate(similarities):
56
  if score > 0.2:
57
  matched.append(f"βœ“ {los[i]} (Match: {score:.2f})")
58
- return matched if matched else ["No significant LO matches found."]
59
 
60
  def summarize_added_lines(old_text, new_text):
61
  old_lines = set(old_text.splitlines())
@@ -68,7 +72,25 @@ def summarize_added_lines(old_text, new_text):
68
  if len(line_clean.split()) >= 5:
69
  summary.append("- " + line_clean)
70
 
71
- return summary if summary else ["No major content additions found."]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
 
73
  def compare_handouts(old_pdf, new_pdf, lo_file):
74
  old_text = extract_text_from_pdf(old_pdf)
@@ -76,16 +98,24 @@ def compare_handouts(old_pdf, new_pdf, lo_file):
76
  los = extract_los(lo_file)
77
 
78
  if not old_text or not new_text:
79
- return "❗ One or both PDFs may not contain extractable text.", "", "", ""
80
 
81
- added_summary = summarize_added_lines(old_text, new_text)
82
- lo_matches = find_relevant_los(new_text, los)
 
83
  quality = quality_check(new_text)
84
 
85
  summary_output = "\n".join(added_summary)
86
  lo_output = "\n".join(lo_matches)
87
 
88
- return summary_output, lo_output, quality, "βœ… Use this summary to generate heatmaps externally."
 
 
 
 
 
 
 
89
 
90
  iface = gr.Interface(
91
  fn=compare_handouts,
@@ -96,12 +126,12 @@ iface = gr.Interface(
96
  ],
97
  outputs=[
98
  gr.Textbox(label="πŸ†• New Content Summary", lines=10),
99
- gr.Textbox(label="🎯 LO Matches (Relevance to New)", lines=10),
100
- gr.Textbox(label="πŸ“ˆ Content Quality Insight", lines=2),
101
- gr.Textbox(label="🌈 Visual Heatmap Prep", lines=2)
102
  ],
103
- title="πŸ“˜ Smart Handout Analyzer: Content, LO & Quality",
104
- description="This tool finds new content in the updated PDF, matches it to learning outcomes (LOs), and estimates content quality. Heatmap-ready output included."
105
  )
106
 
107
  iface.launch()
 
3
  import fitz # PyMuPDF
4
  from docx import Document
5
  import io
 
6
  import re
7
+ import difflib
8
+ import matplotlib.pyplot as plt
9
+ import numpy as np
10
  from sklearn.feature_extraction.text import TfidfVectorizer
11
  from sklearn.metrics.pairwise import cosine_similarity
12
 
 
51
  return "πŸ”΄ New content may need more detail."
52
 
53
  def find_relevant_los(content, los):
54
+ if not los:
55
+ return [], 0
56
  vectorizer = TfidfVectorizer().fit_transform([content] + los)
57
  similarities = cosine_similarity(vectorizer[0:1], vectorizer[1:]).flatten()
58
  matched = []
59
  for i, score in enumerate(similarities):
60
  if score > 0.2:
61
  matched.append(f"βœ“ {los[i]} (Match: {score:.2f})")
62
+ return matched, len(matched)
63
 
64
  def summarize_added_lines(old_text, new_text):
65
  old_lines = set(old_text.splitlines())
 
72
  if len(line_clean.split()) >= 5:
73
  summary.append("- " + line_clean)
74
 
75
+ return summary, len(added_lines), len(new_lines)
76
+
77
+ def create_heatmap(old_text, new_text):
78
+ old_lines = old_text.splitlines()
79
+ new_lines = new_text.splitlines()
80
+ sm = difflib.SequenceMatcher(None, old_lines, new_lines)
81
+ diff_matrix = np.zeros((len(new_lines), 1))
82
+
83
+ for opcode, i1, i2, j1, j2 in sm.get_opcodes():
84
+ if opcode == 'insert':
85
+ for j in range(j1, j2):
86
+ if j < len(diff_matrix):
87
+ diff_matrix[j][0] = 1 # Mark added lines
88
+
89
+ fig, ax = plt.subplots(figsize=(2, len(new_lines) * 0.2))
90
+ ax.imshow(diff_matrix, cmap="Reds", aspect='auto')
91
+ ax.axis('off')
92
+ fig.tight_layout()
93
+ return fig
94
 
95
  def compare_handouts(old_pdf, new_pdf, lo_file):
96
  old_text = extract_text_from_pdf(old_pdf)
 
98
  los = extract_los(lo_file)
99
 
100
  if not old_text or not new_text:
101
+ return "❗ Error in file(s)", "", "", "", None
102
 
103
+ added_summary, added_lines, total_lines = summarize_added_lines(old_text, new_text)
104
+ percent_change = (added_lines / max(total_lines, 1)) * 100
105
+ lo_matches, matched_count = find_relevant_los(new_text, los)
106
  quality = quality_check(new_text)
107
 
108
  summary_output = "\n".join(added_summary)
109
  lo_output = "\n".join(lo_matches)
110
 
111
+ stats = (
112
+ f"πŸ“ˆ Percent Change in Content: {percent_change:.2f}%\n"
113
+ f"🎯 Matched Learning Outcomes: {matched_count} out of {len(los)}\n"
114
+ f"{quality}"
115
+ )
116
+
117
+ heatmap_fig = create_heatmap(old_text, new_text)
118
+ return summary_output, lo_output, stats, heatmap_fig
119
 
120
  iface = gr.Interface(
121
  fn=compare_handouts,
 
126
  ],
127
  outputs=[
128
  gr.Textbox(label="πŸ†• New Content Summary", lines=10),
129
+ gr.Textbox(label="🎯 LO Matches", lines=10),
130
+ gr.Textbox(label="πŸ“Š Stats & Quality", lines=5),
131
+ gr.Plot(label="🌈 Visual Heatmap of Changes")
132
  ],
133
+ title="πŸ“˜ Smart Handout Analyzer: Percent Change, LOs & Quality",
134
+ description="Upload old & new PDFs plus Learning Outcomes to see content updates, LO match, quality insights, and a heatmap of added content."
135
  )
136
 
137
  iface.launch()