bestroi committed on
Commit
630b469
·
verified ·
1 Parent(s): 3207e10

Upload 6 files

Browse files
app.py ADDED
@@ -0,0 +1,351 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import os
4
+ import re
5
+ import html
6
+ from pathlib import Path
7
+
8
+ # Function to load all CSV files from the current directory
9
def load_csv_files():
    """Load every ``*_sorted.csv`` file found in the current working directory.

    Files are visited in sorted path order so the returned mapping (and any
    city list derived from it) is identical on every run; ``Path.glob`` on its
    own yields entries in an arbitrary, filesystem-dependent order.

    Returns:
        dict: Maps a display name to the file's DataFrame. The display name is
        the file stem with ``_sorted`` removed, underscores replaced by spaces,
        and title-cased. Missing cells are replaced with empty strings. Files
        that cannot be read are reported on stdout and skipped.
    """
    csv_files = {}
    for file in sorted(Path(".").glob("*_sorted.csv")):
        try:
            # Fill NaN values with empty strings so later string handling is uniform
            df = pd.read_csv(file, encoding='utf-8').fillna("")
        except Exception as e:
            # Best effort: one unreadable file must not stop the others from loading
            print(f"Error loading {file}: {e}")
            continue

        # Derive the city name from the filename
        city_name = file.stem.replace('_sorted', '').replace('_', ' ').title()
        csv_files[city_name] = df
    return csv_files
24
+
25
+ # Function to get unique queries for a specific city
26
def get_queries_for_city(city):
    """Return the sorted, distinct, non-blank queries recorded for ``city``.

    Args:
        city: Key into the module-level ``all_data`` mapping.

    Returns:
        list[str]: Distinct values of the city's ``query`` column converted to
        strings and sorted; falsy and whitespace-only values are left out.
        An empty list is returned when the city is not loaded.
    """
    frame = all_data.get(city)
    if frame is None:
        return []

    # Distinct values of the query column, with missing values removed
    distinct_values = frame['query'].dropna().unique().tolist()

    labels = []
    for value in distinct_values:
        if not value:
            continue
        text = str(value)
        if text.strip():
            labels.append(text)

    labels.sort()
    return labels
37
+
38
+ # Function to find entries that have empty or missing queries
39
def find_empty_queries(city, preserve_order=True):
    """Render, as HTML, the rows of a city's dataset whose query is blank.

    Args:
        city: Key into the module-level ``all_data`` mapping.
        preserve_order: When true, matching rows are sorted by their DataFrame
            index label before rendering.

    Returns:
        str: ``"City data not found"`` for an unknown city,
        ``"No entries without queries found"`` when no row qualifies, otherwise
        an HTML fragment holding one ``result-item`` block per matching row.
    """
    frame = all_data.get(city)
    if frame is None:
        return "City data not found"

    blank_rows = []
    for label, record in frame.iterrows():
        raw_query = record['query']
        query_is_nan = pd.isna(raw_query)
        if not query_is_nan and str(raw_query).strip() != "":
            continue  # this row carries a real query

        # Normalise every displayed field to a string; NaN becomes ""
        raw_context = record['context']
        context_text = "" if pd.isna(raw_context) else str(raw_context)
        query_text = "(No Query)" if query_is_nan else str(raw_query)
        raw_url = record['url']
        url_text = "" if pd.isna(raw_url) else str(raw_url)

        blank_rows.append(dict(
            url=url_text,
            context=context_text,
            query=query_text,
            original_index=label,  # index label of the row in the dataset
        ))

    if not blank_rows:
        return "No entries without queries found"

    if preserve_order:
        blank_rows = sorted(blank_rows, key=lambda entry: entry['original_index'])

    # Same HTML layout as the search results: a wrapper div with one item per row
    fragments = ["<div class='search-results'>"]
    for number, entry in enumerate(blank_rows, start=1):
        safe_url = html.escape(entry['url'])
        dataset_row = entry['original_index'] + 1  # shown 1-based

        # Show at most 300 characters of context, escaped for HTML
        body = entry['context']
        try:
            suffix = '...' if len(body) > 300 else ''
            preview = html.escape(body[:300] + suffix)
        except (TypeError, AttributeError):
            preview = html.escape(str(body))

        fragments.append(
            "<div class='result-item'>"
            f"<h3>Entry Without Query #{number} <span class='original-index'>(Dataset Row: {dataset_row})</span></h3>"
            f"<p><b>URL:</b> <a href='{safe_url}' target='_blank'>{safe_url}</a></p>"
            f"<p><b>Context:</b> {preview}</p>"
            "</div><hr>"
        )

    fragments.append("</div>")
    return "".join(fragments)
92
+
93
+ # Function to search through the dataframes based on query
94
def search_data(city, search_type, search_query, case_sensitive=False, preserve_order=True):
    """Search a city's ``context`` and ``query`` columns and render the hits as HTML.

    Args:
        city: Key into the module-level ``all_data`` mapping.
        search_type: ``"Simple Text Search"`` for substring matching or
            ``"Regular Expression Search"`` for ``re`` pattern matching. Any
            other value yields no matches.
        search_query: Text or pattern to look for; converted with ``str``.
        case_sensitive: When false, substring search compares lowercased text
            and regex search is compiled with ``re.IGNORECASE``.
        preserve_order: When true, hits are sorted by their DataFrame index
            label before rendering.

    Returns:
        str: A plain message (``"City data not found"``,
        ``"Please enter a search query"``, ``"Regular expression error: ..."``
        or ``"No matching results found"``) or an HTML fragment with one
        ``result-item`` block per matching row.
    """
    data = all_data.get(city)
    if data is None:
        return "City data not found"

    # Reject missing or whitespace-only queries
    if not search_query or str(search_query).strip() == "":
        return "Please enter a search query"
    search_query = str(search_query)

    if search_type == "Simple Text Search":
        # Lowercase only for substring search. Lowercasing a regex would turn
        # escapes such as \S, \D, \W and \B into their opposites.
        needle = search_query if case_sensitive else search_query.lower()

        def is_match(context, query):
            if not case_sensitive:
                context, query = context.lower(), query.lower()
            return needle in context or needle in query

    elif search_type == "Regular Expression Search":
        try:
            pattern = re.compile(search_query, flags=0 if case_sensitive else re.IGNORECASE)
        except re.error as e:
            return f"Regular expression error: {str(e)}"

        def is_match(context, query):
            return bool(pattern.search(context) or pattern.search(query))

    else:
        # Unknown search type: nothing can match
        return "No matching results found"

    results = []
    for i, row in data.iterrows():
        # Normalise every field to a string; NaN becomes ""
        context = str(row['context']) if not pd.isna(row['context']) else ""
        query = str(row['query']) if not pd.isna(row['query']) else ""
        url = str(row['url']) if not pd.isna(row['url']) else ""

        if is_match(context, query):
            results.append({
                'url': url,
                'context': context,
                'query': query,
                'original_index': i,  # index label of the row in the dataset
            })

    if not results:
        return "No matching results found"

    if preserve_order:
        results.sort(key=lambda x: x['original_index'])

    # Build the HTML in a list and join once instead of repeated string +=
    parts = ["<div class='search-results'>"]
    for i, result in enumerate(results, 1):
        url_safe = html.escape(result['url'])
        original_idx = result['original_index'] + 1  # shown 1-based

        # Show at most 300 characters of context, escaped for HTML
        context = result['context']
        context_preview = html.escape(context[:300] + ('...' if len(context) > 300 else ''))

        parts.append("<div class='result-item'>")
        parts.append(f"<h3>Result {i} <span class='original-index'>(Dataset Row: {original_idx})</span></h3>")
        parts.append(f"<p><b>URL:</b> <a href='{url_safe}' target='_blank'>{url_safe}</a></p>")
        parts.append(f"<p><b>Query:</b> {html.escape(result['query'])}</p>")
        parts.append(f"<p><b>Context:</b> {context_preview}</p>")
        parts.append("</div><hr>")
    parts.append("</div>")

    return "".join(parts)
188
+
189
+ # Load all CSV files on startup
190
# Read every city dataset once at import time; the search helpers look cities up in `all_data`.
all_data = load_csv_files()
# Fall back to a single placeholder entry when no CSV file was loaded.
city_names = list(all_data) or ["No data found"]
194
+
195
+ # Create the Gradio interface
196
# Build the Gradio UI: search controls in the first column, output in the second.
with gr.Blocks(title="Ancient Cities CSV Query") as app:
    gr.Markdown("# Ancient Cities CSV Query Interface")
    gr.Markdown("Search through information about ancient cities from CSV files.")

    with gr.Row():
        # --- Controls ---
        with gr.Column():
            city_dropdown = gr.Dropdown(choices=city_names, value=city_names[0] if city_names else None, label="Select City")

            # Queries offered for the currently selected city; free text is also accepted
            query_dropdown = gr.Dropdown(choices=get_queries_for_city(city_names[0] if city_names else None), label="Select a Query", allow_custom_value=True)

            search_type = gr.Radio(choices=["Simple Text Search", "Regular Expression Search"], value="Simple Text Search", label="Search Type")

            # Free-text query; when filled in it takes precedence over the dropdown
            search_query = gr.Textbox(label="Custom Search Query (optional)", placeholder="Enter custom text to search for...")

            case_sensitive = gr.Checkbox(label="Case Sensitive", value=False)

            show_empty_queries = gr.Checkbox(label="Show Entries Without Queries", value=False, info="Check this to display entries that have empty or missing queries")

            preserve_order = gr.Checkbox(label="Preserve Original Dataset Order", value=True, info="When checked, results will be displayed in their original order from the dataset. When unchecked, results will be displayed in the order they are found.")

            search_button = gr.Button("Search")

        # --- Output ---
        with gr.Column():
            results_text = gr.HTML(label="Search Results", value="", elem_classes=["results-output"])

    stats_text = gr.Textbox(label="Dataset Statistics", value=f"Total cities loaded: {len(city_names)}\nCities: {', '.join(city_names)}")

    def update_queries(city):
        """Return a dropdown update listing the queries of the newly selected city."""
        city_queries = get_queries_for_city(city)
        return gr.Dropdown(choices=city_queries)

    # Refresh the query choices whenever another city is picked
    city_dropdown.change(fn=update_queries, inputs=city_dropdown, outputs=query_dropdown)

    def search_with_queries(city, search_type, query_from_dropdown, custom_query, case_sensitive, show_empty_queries, preserve_order):
        """Dispatch a button click to the empty-query listing or to a search.

        When ``show_empty_queries`` is set, the rows without a query are listed.
        Otherwise a non-blank ``custom_query`` is searched for, falling back to
        ``query_from_dropdown`` when the custom text is missing or blank.
        """
        if show_empty_queries:
            return find_empty_queries(city, preserve_order)

        has_custom_text = bool(custom_query and custom_query.strip())
        final_query = custom_query if has_custom_text else query_from_dropdown
        return search_data(city, search_type, final_query, case_sensitive, preserve_order)

    search_button.click(fn=search_with_queries, inputs=[city_dropdown, search_type, query_dropdown, search_query, case_sensitive, show_empty_queries, preserve_order], outputs=results_text)
283
+
284
+ # Launch the app
285
# Launch the app when executed as a script
if __name__ == "__main__":
    try:
        print("Starting Ancient Cities Query Interface...")
        print(f"Loaded {len(city_names)} cities: {', '.join(city_names)}")

        # Inject the stylesheet as an HTML component inside the existing Blocks
        with app:
            gr.HTML("""
            <style>
            .gradio-container {
                font-family: 'Arial', sans-serif;
            }
            .results-output {
                max-height: 600px;
                overflow-y: auto;
                padding: 10px;
                border: 1px solid #ddd;
                border-radius: 5px;
            }
            a {
                color: #007bff;
                text-decoration: none;
            }
            a:hover {
                text-decoration: underline;
            }
            b {
                color: #333;
            }
            .search-results {
                font-family: 'Arial', sans-serif;
            }
            .result-item {
                margin-bottom: 15px;
                padding: 10px;
                background-color: #f9f9f9;
                border-radius: 5px;
            }
            .result-item h3 {
                margin-top: 0;
                color: #333;
            }
            .original-index {
                font-size: 0.8em;
                color: #666;
                font-weight: normal;
            }
            /* Each result <div> is followed by an <hr> sibling, so with
               :nth-child every result is an odd child and the striping never
               alternates. :nth-of-type counts only the <div> siblings. */
            .result-item:nth-of-type(odd) {
                background-color: #f5f5f5;
            }
            .result-item:nth-of-type(even) {
                background-color: #ffffff;
            }
            hr {
                border: 0;
                height: 1px;
                background-color: #ddd;
                margin: 15px 0;
            }
            </style>
            """)

        app.launch(show_error=True)
    except Exception as e:
        # Top-level boundary: report the failure with a full traceback
        print(f"Error starting application: {e}")
        import traceback
        traceback.print_exc()
dionysopolis_sorted.csv ADDED
The diff for this file is too large to render. See raw diff
 
nicopolis_ad_istrum_sorted.csv ADDED
The diff for this file is too large to render. See raw diff
 
nicopolis_ad_nestum_sorted.csv ADDED
The diff for this file is too large to render. See raw diff
 
oescus_sorted.csv ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ gradio==4.19.2
2
+ pandas==2.1.4