SorrelC committed on
Commit
e500892
·
verified ·
1 Parent(s): 3295b9f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +252 -22
app.py CHANGED
@@ -1,3 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # Create the Gradio interface
2
  def create_interface():
3
  with gr.Blocks(title="Keyword Tagging Tool", theme=gr.themes.Soft()) as demo:
@@ -92,24 +323,6 @@ def create_interface():
92
  with gr.Column(scale=1):
93
  example2_btn = gr.Button("Load Example 2", variant="secondary", size="sm")
94
 
95
- # Define example data
96
- example1_data = [
97
- "During World War II, many prisoners of war were held in camps across Europe. The Geneva Convention established rules for POW treatment. American soldiers and British troops were among those captured.",
98
- "Prisoner of War", "POW; POWs; prisoner of war",
99
- "World War II", "WWII; Second World War",
100
- "United States", "USA; US; America; American",
101
- "", "", "", ""
102
- ]
103
-
104
- example2_data = [
105
- "The University of Oxford is located in Oxford, England. Students from around the world study at this prestigious institution.",
106
- "University", "university; institution; college",
107
- "Oxford", "oxford",
108
- "England", "england; English",
109
- "Student", "student; students; pupils",
110
- "", ""
111
- ]
112
-
113
  # Clear functions
114
  def clear_dictionary_only():
115
  """Clear only the keyword dictionary fields"""
@@ -121,10 +334,23 @@ def create_interface():
121
 
122
  # Example loading functions
123
  def load_example1():
124
- return example1_data
 
 
 
 
 
 
125
 
126
  def load_example2():
127
- return example2_data
 
 
 
 
 
 
 
128
 
129
  # Button functions
130
  find_btn.click(
@@ -140,7 +366,7 @@ def create_interface():
140
 
141
  clear_all_btn.click(
142
  fn=clear_everything,
143
- outputs=[text_input, primary1, synonyms1, primary2, synonyms2, primary3, synonyms3, primary4, synonyms4, primary5, synonyms5, results_output, highlighted_output, copy_output]
144
  )
145
 
146
  example1_btn.click(
@@ -186,4 +412,8 @@ def create_interface():
186
  </div>
187
  """)
188
 
189
- return demo
 
 
 
 
 
1
+ import gradio as gr
2
+ import re
3
+ import pandas as pd
4
+
5
def build_keywords_dict(primary_inputs, synonym_inputs):
    """Build the keyword dictionary from paired primary/synonym form inputs.

    Args:
        primary_inputs: Iterable of primary keyword strings. Entries that are
            None, empty or whitespace-only are skipped together with their
            paired synonym entry.
        synonym_inputs: Iterable of semicolon-separated synonym strings,
            paired positionally with ``primary_inputs`` (entries may be None
            or empty).

    Returns:
        A dict mapping each stripped primary keyword to the list of its
        stripped, non-empty synonyms in first-seen order. When the same
        primary keyword is entered on more than one row, the synonyms of all
        those rows are merged instead of the last row overwriting the others.
        Duplicate synonyms and synonyms identical to their primary keyword
        are dropped so no term is listed or counted twice downstream.
    """
    keywords_dict = {}

    for primary, synonyms in zip(primary_inputs, synonym_inputs):
        # Only process rows that actually carry a primary keyword.
        if not primary or not primary.strip():
            continue
        primary_clean = primary.strip()

        # setdefault keeps synonyms collected from an earlier row that used
        # the same primary keyword.
        synonym_list = keywords_dict.setdefault(primary_clean, [])

        for raw_synonym in (synonyms or "").split(';'):
            synonym = raw_synonym.strip()
            if synonym and synonym != primary_clean and synonym not in synonym_list:
                synonym_list.append(synonym)

    return keywords_dict
19
+
20
def find_keywords(story, keywords_dict):
    """Find which keyword groups occur in the story text.

    A group is the primary keyword plus its synonyms. Terms are matched as
    whole words, case-insensitively, except for the term "US", which is
    matched case-sensitively so that the ordinary word "us" does not count.

    Args:
        story: The text to search. Anything that is not a non-empty string
            yields an empty result.
        keywords_dict: Mapping of primary keyword -> list of synonyms.

    Returns:
        A '; '-separated string, sorted, containing the primary keyword and
        ALL synonyms of every group for which at least one term occurs in
        the text. Returns '' when nothing matches.
    """
    if not story or not isinstance(story, str):
        return ''

    def term_in_story(term):
        """Return True when ``term`` occurs in the story as a whole word."""
        if not term:
            # An empty term would compile to r'\b\b' and match anywhere.
            return False
        if term.upper() == "US":
            # Case-sensitive whole-word match. Unlike a test for ' US ',
            # this also catches "the US.", "(US)", "US," and a bare "US".
            return re.search(r'\bUS\b', story) is not None
        pattern = r'\b' + re.escape(term) + r'\b'
        return re.search(pattern, story, re.IGNORECASE) is not None

    found_keywords = set()

    for primary_keyword, synonyms in keywords_dict.items():
        # If any term of the group is present, report the whole group.
        if any(term_in_story(term) for term in (primary_keyword, *synonyms)):
            found_keywords.add(primary_keyword)
            found_keywords.update(synonyms)

    return '; '.join(sorted(found_keywords))
55
+
56
def highlight_keywords_in_text(text, keywords_list):
    """Create HTML with highlighted keywords while preserving line breaks.

    All keywords are matched in ONE pass over the raw text, so a later
    keyword can never match inside markup inserted for an earlier one (for
    example the keyword "white" matching ``color: white`` in a span's style)
    and overlapping keywords never produce nested spans. The text as written
    by the user is kept inside the highlight (its casing is not replaced by
    the keyword's casing), and every piece of user text is HTML-escaped.

    Args:
        text: The plain input text.
        keywords_list: Keywords to highlight. Empty entries are ignored. A
            keyword's colour is chosen by its position in this list. The
            keyword "US" is matched case-sensitively; every other keyword is
            matched case-insensitively. All are matched as whole words.

    Returns:
        An HTML string in which matches are wrapped in coloured ``<span>``
        elements and newlines are converted to ``<br>``. Returns '' for
        empty or None text.
    """
    import html  # function-scope import: the block stays self-contained

    if not text:
        return ''

    colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#F9CA24', '#6C5CE7', '#A0E7E5', '#FD79A8', '#55A3FF', '#00B894', '#E17055']

    # Colour is tied to the keyword's first position in the incoming list.
    color_by_keyword = {}
    for i, keyword in enumerate(keywords_list or []):
        if keyword and keyword not in color_by_keyword:
            color_by_keyword[keyword] = colors[i % len(colors)]

    if not color_by_keyword:
        # Nothing to highlight: still escape and convert line breaks.
        return html.escape(text, quote=False).replace('\n', '<br>')

    # Longest keyword first so that "prisoner of war" wins over "war" when
    # both could match at the same position.
    ordered = sorted(color_by_keyword, key=len, reverse=True)
    alternatives = []
    for n, keyword in enumerate(ordered):
        if keyword.upper() == "US":
            body = 'US'  # case-sensitive on purpose
        else:
            body = '(?i:' + re.escape(keyword) + ')'
        # One named group per keyword lets us recover which keyword matched.
        alternatives.append(f'(?P<k{n}>{body})')
    pattern = re.compile(r'\b(?:' + '|'.join(alternatives) + r')\b')

    parts = []
    cursor = 0
    for match in pattern.finditer(text):
        keyword = ordered[int(match.lastgroup[1:])]
        color = color_by_keyword[keyword]
        parts.append(html.escape(text[cursor:match.start()], quote=False))
        parts.append(
            f'<span style="background-color: {color}; padding: 2px 4px; border-radius: 3px; color: white; font-weight: bold;">'
            f'{html.escape(match.group(0), quote=False)}</span>'
        )
        cursor = match.end()
    parts.append(html.escape(text[cursor:], quote=False))

    # Convert line breaks to HTML breaks after highlighting.
    return ''.join(parts).replace('\n', '<br>')
76
+
77
def create_keyword_results_table(found_keywords_str, keywords_dict, input_text):
    """Create an HTML table showing detailed keyword results.

    One row is produced per keyword group (primary keyword plus synonyms)
    that has at least one term listed in ``found_keywords_str``. Each row
    shows the primary keyword, the group's listed terms, the number of
    occurrences in the text and a context preview.

    Occurrences are counted with a single combined pattern per group, so an
    occurrence matched by several terms of the same group (for example
    "Oxford" and "oxford", which both match case-insensitively) is counted
    once rather than once per term. The preview shows up to 30 characters on
    either side of the earliest occurrence, with every group match inside
    that window highlighted using the text as it appears in the input. All
    user-supplied text is HTML-escaped.

    Args:
        found_keywords_str: '; '-separated keywords, as returned by the
            keyword search.
        keywords_dict: Mapping of primary keyword -> list of synonyms.
        input_text: The analysed text.

    Returns:
        An HTML string: the results table, or a short ``<p>`` message when
        there is nothing to show.
    """
    import html  # function-scope import: the block stays self-contained

    if not found_keywords_str:
        return "<p style='text-align: center; padding: 20px;'>No keywords found.</p>"

    found_keywords = set(found_keywords_str.split('; '))
    input_text = input_text or ''

    # Group the listed keywords by their primary keyword, without repeats.
    keyword_groups = {}
    for primary, synonyms in keywords_dict.items():
        found_in_group = []
        for term in (primary, *synonyms):
            if term and term in found_keywords and term not in found_in_group:
                found_in_group.append(term)
        if found_in_group:
            keyword_groups[primary] = found_in_group

    if not keyword_groups:
        return "<p style='text-align: center; padding: 20px;'>No keyword groups matched.</p>"

    def group_pattern(terms):
        """Compile one whole-word pattern matching any term of a group."""
        alternatives = []
        # Longest first so the longest term wins at a shared start position.
        for term in sorted(terms, key=len, reverse=True):
            if term.upper() == "US":
                alternatives.append('US')  # special handling: case-sensitive
            else:
                alternatives.append('(?i:' + re.escape(term) + ')')
        return re.compile(r'\b(?:' + '|'.join(alternatives) + r')\b')

    colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#F9CA24', '#6C5CE7', '#A0E7E5', '#FD79A8', '#55A3FF', '#00B894', '#E17055']

    html_parts = ["""
<div style='max-height: 500px; overflow-y: auto; border: 2px solid #ddd; border-radius: 8px; padding: 20px; background-color: #fafafa; margin: 10px 0;'>
<h4 style='margin: 0 0 15px 0; color: #333;'>📊 Detailed Keyword Results</h4>
<table style="width: 100%; border-collapse: collapse; border: 1px solid #ddd; background-color: white;">
<thead>
<tr style="background-color: #6366f1; color: white;">
<th style="padding: 12px; text-align: left; border: 1px solid #ddd;">Primary Keyword</th>
<th style="padding: 12px; text-align: left; border: 1px solid #ddd;">Found Terms</th>
<th style="padding: 12px; text-align: left; border: 1px solid #ddd;">Count in Text</th>
<th style="padding: 12px; text-align: left; border: 1px solid #ddd;">Context Preview</th>
</tr>
</thead>
<tbody>
"""]

    for i, (primary, found_terms) in enumerate(keyword_groups.items()):
        color = colors[i % len(colors)]

        # finditer yields non-overlapping matches in text order, so each
        # occurrence is counted exactly once for the whole group.
        matches = list(group_pattern(found_terms).finditer(input_text))
        total_count = len(matches)

        if matches:
            first = matches[0]
            start = max(0, first.start() - 30)
            end = min(len(input_text), first.end() + 30)

            # Highlight every group match that lies fully inside the window.
            pieces = []
            cursor = start
            for match in matches:
                if match.end() > end:
                    break
                pieces.append(html.escape(input_text[cursor:match.start()], quote=False))
                pieces.append(
                    f'<strong style="background-color: {color}; color: white; padding: 1px 2px; border-radius: 2px;">'
                    f'{html.escape(match.group(0), quote=False)}</strong>'
                )
                cursor = match.end()
            pieces.append(html.escape(input_text[cursor:end], quote=False))
            context_html = ''.join(pieces).replace('\n', ' ')
        else:
            context_html = 'No context available'

        found_terms_display = ' '.join(
            f'<span style="background-color: {color}; color: white; padding: 2px 6px; border-radius: 10px; font-size: 12px; margin: 1px;">{html.escape(term, quote=False)}</span>'
            for term in found_terms
        )

        html_parts.append(f"""
<tr style="background-color: #fff;">
<td style="padding: 10px; border: 1px solid #ddd; font-weight: bold;">{html.escape(primary, quote=False)}</td>
<td style="padding: 10px; border: 1px solid #ddd;">{found_terms_display}</td>
<td style="padding: 10px; border: 1px solid #ddd; text-align: center;">
<span style='background-color: #28a745; color: white; padding: 4px 8px; border-radius: 12px; font-weight: bold;'>
{total_count}
</span>
</td>
<td style="padding: 10px; border: 1px solid #ddd; font-style: italic; font-size: 14px;">
{context_html}...
</td>
</tr>
""")

    html_parts.append("""
</tbody>
</table>
</div>
""")

    return ''.join(html_parts)
191
+
192
def process_text(input_text, primary1, synonyms1, primary2, synonyms2, primary3, synonyms3, primary4, synonyms4, primary5, synonyms5):
    """Run the keyword search and build every output shown in the interface.

    Args:
        input_text: The text to analyse (None or blank text is rejected with
            a message instead of raising).
        primary1..primary5: Primary keyword of each dictionary row.
        synonyms1..synonyms5: Semicolon-separated synonyms of each row.

    Returns:
        A 4-tuple ``(results_summary, highlighted_html, results_table_html,
        found_keywords_str)``:
          * results_summary: Markdown summary, or a plain message when there
            is nothing to analyse or nothing was found.
          * highlighted_html: the input text as HTML with matches
            highlighted ('' when no analysis was run).
          * results_table_html: detailed per-group HTML table ('' when there
            are no matches).
          * found_keywords_str: '; '-separated matched keywords, or a short
            status message.
    """
    # Guard against None as well as blank text; None has no .strip().
    if not input_text or not input_text.strip():
        return "Please enter some text to analyze", "", "", "No keywords found"

    # Build keywords dictionary from separate inputs
    primary_inputs = [primary1, primary2, primary3, primary4, primary5]
    synonym_inputs = [synonyms1, synonyms2, synonyms3, synonyms4, synonyms5]
    keywords_dict = build_keywords_dict(primary_inputs, synonym_inputs)

    if not keywords_dict:
        return "Please enter at least one primary keyword", "", "", "No keyword dictionary provided"

    # Find keywords in the text
    found_keywords_str = find_keywords(input_text, keywords_dict)

    if not found_keywords_str:
        # Render the text through the same HTML formatter as the match path
        # so line breaks are preserved in the HTML output.
        plain_html = highlight_keywords_in_text(input_text, [])
        return f"No keywords found in the text.\n\nKeyword dictionary loaded: {len(keywords_dict)} primary keywords", plain_html, "", "No matches found"

    # Create highlighted version
    keywords_list = found_keywords_str.split('; ')
    highlighted_html = highlight_keywords_in_text(input_text, keywords_list)

    # Create results table
    results_table_html = create_keyword_results_table(found_keywords_str, keywords_dict, input_text)

    # Create results summary. Joined line by line so the Markdown never picks
    # up source-code indentation; the text matches the previous literal as
    # shown (unindented) in this view.
    total_terms = sum(len(synonyms) + 1 for synonyms in keywords_dict.values())
    results_summary = "\n".join([
        "",
        "## Results Summary",
        f"**Keywords Found:** {len(keywords_list)}",
        f"**Matched Keywords:** {found_keywords_str}",
        "**Keyword Dictionary Stats:**",
        f"- Primary keywords loaded: {len(keywords_dict)}",
        f"- Total searchable terms: {total_terms}",
        "**Copy this result to your spreadsheet:**",
        f"{found_keywords_str}",
        "",
    ])

    return results_summary, highlighted_html, results_table_html, found_keywords_str
231
+
232
  # Create the Gradio interface
233
  def create_interface():
234
  with gr.Blocks(title="Keyword Tagging Tool", theme=gr.themes.Soft()) as demo:
 
323
  with gr.Column(scale=1):
324
  example2_btn = gr.Button("Load Example 2", variant="secondary", size="sm")
325
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
326
  # Clear functions
327
  def clear_dictionary_only():
328
  """Clear only the keyword dictionary fields"""
 
334
 
335
  # Example loading functions
336
  def load_example1():
337
+ return [
338
+ "During World War II, many prisoners of war were held in camps across Europe. The Geneva Convention established rules for POW treatment. American soldiers and British troops were among those captured.",
339
+ "Prisoner of War", "POW; POWs; prisoner of war",
340
+ "World War II", "WWII; Second World War",
341
+ "United States", "USA; US; America; American",
342
+ "", "", "", ""
343
+ ]
344
 
345
  def load_example2():
346
+ return [
347
+ "The University of Oxford is located in Oxford, England. Students from around the world study at this prestigious institution.",
348
+ "University", "university; institution; college",
349
+ "Oxford", "oxford",
350
+ "England", "england; English",
351
+ "Student", "student; students; pupils",
352
+ "", ""
353
+ ]
354
 
355
  # Button functions
356
  find_btn.click(
 
366
 
367
  clear_all_btn.click(
368
  fn=clear_everything,
369
+ outputs=[text_input, primary1, synonyms1, primary2, synonyms2, primary3, synonyms3, primary4, synonyms4, primary5, synonyms5, results_output, highlighted_output, results_table_output, copy_output]
370
  )
371
 
372
  example1_btn.click(
 
412
  </div>
413
  """)
414
 
415
+ return demo
416
+
417
if __name__ == "__main__":
    # Script entry point: build the Gradio Blocks app and start its server.
    # The module-level name `demo` is kept as in the original.
    demo = create_interface()
    demo.launch()