Azidan committed on
Commit
d8547ca
·
verified ·
1 Parent(s): 6a4c4d0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -11
app.py CHANGED
@@ -28,8 +28,6 @@ SUBJECT_TIPS = {
28
  "Focus on understanding formulas and when to apply them.",
29
  "Work backwards from answers to see common mistake patterns."
30
  ],
31
- "equation": SUBJECT_TIPS["math"] if "math" not in SUBJECT_TIPS else [], # alias
32
- "formula": SUBJECT_TIPS["math"],
33
  "physics": [
34
  "Draw free-body diagrams or sketch scenarios to visualize forces/concepts.",
35
  "Practice unit conversions and dimensional analysis first.",
@@ -55,9 +53,14 @@ SUBJECT_TIPS = {
55
  "Compare this text to others you've read.",
56
  "Practice essay-style answers: thesis + evidence + analysis."
57
  ],
58
- # Add more categories as needed: economics, programming, law, etc.
59
  }
60
 
 
 
 
 
 
 
61
  GENERAL_TIPS = [
62
  "Use **Active Recall**: Cover the summary and explain key points out loud or in writing.",
63
  "Apply **Spaced Repetition**: Review today, in 2–3 days, then in a week (try Anki).",
@@ -70,6 +73,7 @@ GENERAL_TIPS = [
70
  # Utilities
71
  # =========================
72
  def clean_text(text: str) -> str:
 
73
  text = text.replace("‘", "'").replace("’", "'")
74
  text = text.replace("“", '"').replace("”", '"')
75
  text = re.sub(r"[.]{2,}", ".", text)
@@ -87,6 +91,7 @@ def clean_text(text: str) -> str:
87
 
88
 
89
  def chunk_text(text: str):
 
90
  tokens = tokenizer.encode(text, add_special_tokens=False)
91
  chunks = []
92
  for i in range(0, len(tokens), CHUNK_SIZE):
@@ -97,11 +102,15 @@ def chunk_text(text: str):
97
 
98
 
99
  def get_simple_keywords(summary: str, top_n=15):
100
- """Very basic keyword extraction: most frequent words (after removing stop/punct)."""
101
  text = summary.lower()
102
  text = text.translate(str.maketrans("", "", string.punctuation))
103
  words = text.split()
104
- stop_words = {"the", "a", "an", "and", "or", "but", "is", "are", "was", "were", "this", "that", "these", "those", "in", "on", "at", "to", "of", "for", "with", "by", "from", "as", "it", "its"}
 
 
 
 
105
  filtered = [w for w in words if w not in stop_words and len(w) > 2]
106
  counter = Counter(filtered)
107
  return [word for word, _ in counter.most_common(top_n)]
@@ -116,11 +125,11 @@ def generate_dynamic_advice(summary: str):
116
  for word in keywords:
117
  for category, tips in SUBJECT_TIPS.items():
118
  if category in word and category not in seen_categories:
119
- detected_tips.extend(tips[:2]) # take up to 2 per category
120
  seen_categories.add(category)
121
 
122
- # Always add 3–4 general ones
123
- selected_general = GENERAL_TIPS[:4] # or random.sample if you import random
124
 
125
  all_tips = detected_tips + selected_general
126
 
@@ -131,12 +140,13 @@ def generate_dynamic_advice(summary: str):
131
  for tip in all_tips:
132
  advice_md += f"- {tip}\n"
133
 
134
- advice_md += "\n**Pro tip**: Rewrite the summary in your own words after 24 hours — this locks in understanding!\n"
135
 
136
  return advice_md
137
 
138
 
139
  def summarize_long_text(text: str) -> str:
 
140
  if not text or len(text.strip()) == 0:
141
  return "No text provided."
142
 
@@ -155,13 +165,14 @@ def summarize_long_text(text: str) -> str:
155
  merged = " ".join(summaries)
156
  cleaned_summary = clean_text(merged)
157
 
158
- # Dynamic advice
159
  dynamic_advice = generate_dynamic_advice(cleaned_summary)
160
 
161
  return cleaned_summary + dynamic_advice
162
 
163
 
164
  def read_pdf(file) -> str:
 
165
  try:
166
  reader = PdfReader(file)
167
  pages = [page.extract_text() or "" for page in reader.pages]
@@ -188,7 +199,7 @@ with gr.Blocks() as demo:
188
  "• Handles **thousands of words**\n"
189
  "• Supports **PDF upload**\n"
190
  "• Optimized for **CPU / free tier**\n"
191
- "• Includes **general + dynamic study tips** tailored to content keywords"
192
  )
193
 
194
  text_input = gr.Textbox(
 
28
  "Focus on understanding formulas and when to apply them.",
29
  "Work backwards from answers to see common mistake patterns."
30
  ],
 
 
31
  "physics": [
32
  "Draw free-body diagrams or sketch scenarios to visualize forces/concepts.",
33
  "Practice unit conversions and dimensional analysis first.",
 
53
  "Compare this text to others you've read.",
54
  "Practice essay-style answers: thesis + evidence + analysis."
55
  ],
 
56
  }
57
 
58
+ # Add aliases safely AFTER the dictionary is fully defined
59
+ SUBJECT_TIPS["equation"] = SUBJECT_TIPS["math"]
60
+ SUBJECT_TIPS["formula"] = SUBJECT_TIPS["math"]
61
+ # You can easily add more: SUBJECT_TIPS["calculus"] = SUBJECT_TIPS["math"]
62
+ # SUBJECT_TIPS["algebra"] = SUBJECT_TIPS["math"] etc.
63
+
64
  GENERAL_TIPS = [
65
  "Use **Active Recall**: Cover the summary and explain key points out loud or in writing.",
66
  "Apply **Spaced Repetition**: Review today, in 2–3 days, then in a week (try Anki).",
 
73
  # Utilities
74
  # =========================
75
  def clean_text(text: str) -> str:
76
+ """Fix quotes, spacing, repetition, and broken punctuation."""
77
  text = text.replace("‘", "'").replace("’", "'")
78
  text = text.replace("“", '"').replace("”", '"')
79
  text = re.sub(r"[.]{2,}", ".", text)
 
91
 
92
 
93
  def chunk_text(text: str):
94
+ """Token-aware chunking to avoid model overflow."""
95
  tokens = tokenizer.encode(text, add_special_tokens=False)
96
  chunks = []
97
  for i in range(0, len(tokens), CHUNK_SIZE):
 
102
 
103
 
104
  def get_simple_keywords(summary: str, top_n=15):
105
+ """Very basic keyword extraction: most frequent meaningful words."""
106
  text = summary.lower()
107
  text = text.translate(str.maketrans("", "", string.punctuation))
108
  words = text.split()
109
+ stop_words = {
110
+ "the", "a", "an", "and", "or", "but", "is", "are", "was", "were",
111
+ "this", "that", "these", "those", "in", "on", "at", "to", "of",
112
+ "for", "with", "by", "from", "as", "it", "its", "be", "have", "has"
113
+ }
114
  filtered = [w for w in words if w not in stop_words and len(w) > 2]
115
  counter = Counter(filtered)
116
  return [word for word, _ in counter.most_common(top_n)]
 
125
  for word in keywords:
126
  for category, tips in SUBJECT_TIPS.items():
127
  if category in word and category not in seen_categories:
128
+ detected_tips.extend(tips[:2]) # max 2 tips per matched category
129
  seen_categories.add(category)
130
 
131
+ # Always include some general advice
132
+ selected_general = GENERAL_TIPS[:4]
133
 
134
  all_tips = detected_tips + selected_general
135
 
 
140
  for tip in all_tips:
141
  advice_md += f"- {tip}\n"
142
 
143
+ advice_md += "\n**Pro tip**: Try rewriting the main ideas in your own words after 24 hours — it really helps long-term retention!\n"
144
 
145
  return advice_md
146
 
147
 
148
  def summarize_long_text(text: str) -> str:
149
+ """Summarize arbitrarily long text safely + add study advice."""
150
  if not text or len(text.strip()) == 0:
151
  return "No text provided."
152
 
 
165
  merged = " ".join(summaries)
166
  cleaned_summary = clean_text(merged)
167
 
168
+ # Generate dynamic study advice
169
  dynamic_advice = generate_dynamic_advice(cleaned_summary)
170
 
171
  return cleaned_summary + dynamic_advice
172
 
173
 
174
  def read_pdf(file) -> str:
175
+ """Safely extract text from PDF."""
176
  try:
177
  reader = PdfReader(file)
178
  pages = [page.extract_text() or "" for page in reader.pages]
 
199
  "• Handles **thousands of words**\n"
200
  "• Supports **PDF upload**\n"
201
  "• Optimized for **CPU / free tier**\n"
202
+ "• Includes **general + dynamic study tips** based on content keywords"
203
  )
204
 
205
  text_input = gr.Textbox(