SorrelC committed on
Commit
afafadc
·
verified ·
1 Parent(s): 2c32b13

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +9 -249
app.py CHANGED
@@ -1,253 +1,13 @@
1
  import gradio as gr
2
- import re
3
- import pandas as pd
4
 
5
- def parse_keywords_dict(keywords_text):
6
- """Parse the keyword dictionary from text input"""
7
- keywords_dict = {}
8
-
9
- if not keywords_text.strip():
10
- return keywords_dict
11
-
12
- lines = keywords_text.strip().split('\n')
13
- for line in lines:
14
- line = line.strip()
15
- if not line or '|' not in line:
16
- continue
17
-
18
- try:
19
- primary, secondary = line.split('|', 1)
20
- primary = primary.strip()
21
- secondary_list = [keyword.strip() for keyword in secondary.split(';') if keyword.strip()]
22
- keywords_dict[primary] = secondary_list
23
- except:
24
- continue
25
-
26
- return keywords_dict
27
 
28
- def find_keywords(story, keywords_dict):
29
- """Find keywords in the story text"""
30
- if not story or not isinstance(story, str):
31
- return ''
32
-
33
- found_keywords = set()
34
-
35
- # Search for each primary keyword and its synonyms
36
- for primary_keyword, synonyms in keywords_dict.items():
37
- # Special handling for 'US' to avoid matching 'us'
38
- if primary_keyword.upper() == "US":
39
- if ' US ' in story or story.startswith('US ') or story.endswith(' US'):
40
- found_keywords.update(synonyms)
41
- else:
42
- pattern = r'\b' + re.escape(primary_keyword) + r'\b'
43
- if re.search(pattern, story, re.IGNORECASE):
44
- found_keywords.update(synonyms)
45
-
46
- # Check each secondary keyword independently
47
- for synonym in synonyms:
48
- if synonym.upper() == "US":
49
- if ' US ' in story or story.startswith('US ') or story.endswith(' US'):
50
- found_keywords.add(synonym)
51
- else:
52
- if re.search(r'\b' + re.escape(synonym) + r'\b', story, re.IGNORECASE):
53
- found_keywords.add(synonym)
54
-
55
- return '; '.join(sorted(found_keywords))
56
 
57
- def highlight_keywords_in_text(text, keywords_list):
58
- """Create HTML with highlighted keywords"""
59
- if not keywords_list:
60
- return text
61
-
62
- highlighted_text = text
63
- colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#F9CA24', '#6C5CE7', '#A0E7E5', '#FD79A8', '#55A3FF', '#00B894', '#E17055']
64
-
65
- for i, keyword in enumerate(keywords_list):
66
- if keyword:
67
- color = colors[i % len(colors)]
68
- pattern = r'\b' + re.escape(keyword) + r'\b'
69
- replacement = f'<span style="background-color: {color}; padding: 2px 4px; border-radius: 3px; color: white; font-weight: bold;">{keyword}</span>'
70
- highlighted_text = re.sub(pattern, replacement, highlighted_text, flags=re.IGNORECASE)
71
-
72
- return highlighted_text
73
-
74
- def process_text(input_text, keywords_text):
75
- """Main processing function"""
76
- if not input_text.strip():
77
- return "Please enter some text to analyze", "", "No keywords found"
78
-
79
- if not keywords_text.strip():
80
- return "Please enter keyword mappings", "", "No keyword dictionary provided"
81
-
82
- # Parse keywords dictionary
83
- keywords_dict = parse_keywords_dict(keywords_text)
84
-
85
- if not keywords_dict:
86
- return "Invalid keyword format. Please check your keyword dictionary format.", "", "Error parsing keywords"
87
-
88
- # Find keywords in the text
89
- found_keywords_str = find_keywords(input_text, keywords_dict)
90
-
91
- if not found_keywords_str:
92
- return f"No keywords found in the text.\n\nKeyword dictionary loaded: {len(keywords_dict)} primary keywords", input_text, "No matches found"
93
-
94
- # Create highlighted version
95
- keywords_list = found_keywords_str.split('; ')
96
- highlighted_html = highlight_keywords_in_text(input_text, keywords_list)
97
-
98
- # Create results summary
99
- results_summary = f"""
100
- ## Results Summary
101
-
102
- **Keywords Found:** {len(keywords_list)}
103
- **Matched Keywords:** {found_keywords_str}
104
-
105
- **Keyword Dictionary Stats:**
106
- - Primary keywords loaded: {len(keywords_dict)}
107
- - Total searchable terms: {sum(len(synonyms) + 1 for synonyms in keywords_dict.values())}
108
-
109
- **Copy this result to your spreadsheet:**
110
- {found_keywords_str}
111
- """
112
-
113
- return results_summary, highlighted_html, found_keywords_str
114
-
115
- # Create the Gradio interface
116
- def create_interface():
117
- with gr.Blocks(title="Keyword Tagging Tool", theme=gr.themes.Soft()) as demo:
118
- gr.Markdown("""
119
- # Keyword Tagging Tool
120
-
121
- This tool matches text against a controlled vocabulary and returns all associated keywords.
122
- Based on the keyword matching logic used for digital humanities research.
123
-
124
- ## How to use:
125
- 1. **Enter your text** in the left panel
126
- 2. **Define your keyword dictionary** in the right panel using the format: `Primary Keyword | synonym1; synonym2; synonym3`
127
- 3. **Click "Find Keywords"** to see results
128
- 4. **Copy the results** to paste into your spreadsheet
129
- """)
130
-
131
- with gr.Row():
132
- with gr.Column(scale=1):
133
- text_input = gr.Textbox(
134
- label="Text to Analyze",
135
- placeholder="Enter the text you want to tag with keywords...",
136
- lines=15,
137
- max_lines=20
138
- )
139
-
140
- with gr.Column(scale=1):
141
- keywords_input = gr.Textbox(
142
- label="Keyword Dictionary",
143
- placeholder="""Enter one mapping per line:
144
- Primary Keyword | synonym1; synonym2; synonym3
145
-
146
- Example:
147
- Prisoner of War | Prisoner of War; POW; POWs
148
- United States | United States; USA; US; America
149
- """,
150
- lines=15,
151
- max_lines=20
152
- )
153
-
154
- with gr.Row():
155
- find_btn = gr.Button("Find Keywords", variant="primary", size="lg")
156
- clear_btn = gr.Button("Clear All", size="lg")
157
-
158
- with gr.Row():
159
- results_output = gr.Markdown(label="Results Summary")
160
-
161
- with gr.Row():
162
- highlighted_output = gr.HTML(label="Text with Highlighted Keywords")
163
-
164
- with gr.Row():
165
- copy_output = gr.Textbox(
166
- label="Keywords for Spreadsheet (copy this text)",
167
- lines=3,
168
- max_lines=5
169
- )
170
-
171
- # Examples section
172
- gr.Markdown("### Examples")
173
- gr.Examples(
174
- examples=[
175
- [
176
- "During World War II, many prisoners of war were held in camps across Europe. The Geneva Convention established rules for POW treatment. American soldiers and British troops were among those captured.",
177
- """Prisoner of War | Prisoner of War; POW; POWs; prisoner of war
178
- World War II | World War II; WWII; Second World War
179
- United States | United States; USA; US; America; American
180
- United Kingdom | United Kingdom; UK; Britain; British"""
181
- ],
182
- [
183
- "The University of Oxford is located in Oxford, England. Students from around the world study at this prestigious institution. The university has many colleges including Christ Church and Magdalen College.",
184
- """University | University; university; institution; college
185
- Oxford | Oxford; oxford
186
- England | England; england; English
187
- Student | Student; student; students; pupils"""
188
- ],
189
- [
190
- "Shakespeare wrote many famous plays including Hamlet, Romeo and Juliet, and Macbeth. These works are performed in theatres worldwide and studied in schools.",
191
- """William Shakespeare | Shakespeare; william shakespeare; playwright
192
- Theatre | Theatre; theater; stage; performance
193
- Play | Play; plays; drama; theatrical work
194
- School | School; schools; education; academic"""
195
- ]
196
- ],
197
- inputs=[text_input, keywords_input],
198
- label="Click an example to try it out"
199
- )
200
-
201
- # Button functions
202
- find_btn.click(
203
- fn=process_text,
204
- inputs=[text_input, keywords_input],
205
- outputs=[results_output, highlighted_output, copy_output]
206
- )
207
-
208
- def clear_all():
209
- return "", "", "", "", ""
210
-
211
- clear_btn.click(
212
- fn=clear_all,
213
- outputs=[text_input, keywords_input, results_output, highlighted_output, copy_output]
214
- )
215
-
216
- # Instructions
217
- gr.Markdown("""
218
- ## Format Guide
219
-
220
- **Keyword Dictionary Format:**
221
- - One primary keyword per line
222
- - Use the pipe symbol `|` to separate primary keyword from synonyms
223
- - Use semicolons `;` to separate multiple synonyms
224
- - Case-insensitive matching (except for special cases like "US")
225
-
226
- **Special Handling:**
227
- - "US" is matched exactly to avoid confusion with the word "us"
228
- - Word boundaries are respected (prevents partial matches)
229
- - Results are alphabetized and deduplicated
230
-
231
- **Example Dictionary Entry:**
232
- ```
233
- Prisoner of War | Prisoner of War; POW; POWs; prisoner of war
234
- ```
235
-
236
- This will find any occurrence of "Prisoner of War", "POW", "POWs", or "prisoner of war" in your text and return all the associated terms.
237
- """)
238
-
239
- # Footer
240
- gr.HTML("""
241
- <div style="margin-top: 40px; padding: 20px; background-color: #f8f9fa; border-radius: 8px; text-align: center;">
242
- <p style="margin: 0; color: #666;">
243
- Created for digital humanities keyword tagging workflows.
244
- Based on controlled vocabulary matching principles.
245
- </p>
246
- </div>
247
- """)
248
-
249
- return demo
250
-
251
- if __name__ == "__main__":
252
- demo = create_interface()
253
- demo.launch()
 
1
  import gradio as gr
 
 
2
 
3
+ def simple_test(text):
4
+ return f"You entered: {text}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
+ with gr.Blocks() as demo:
7
+ gr.Markdown("# Test App")
8
+ input_box = gr.Textbox(label="Test Input")
9
+ output_box = gr.Textbox(label="Output")
10
+ btn = gr.Button("Test")
11
+ btn.click(simple_test, inputs=input_box, outputs=output_box)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
+ demo.launch()