Spaces:

bestroi
/

ArchaeoQuery

Sleeping

App Files Files Community

bestroi committed on Jul 29

Commit

14715bc

verified ·

1 Parent(s): 630b469

Update app.py

Browse files

Files changed (1) hide show

app.py +350 -351

app.py CHANGED Viewed

@@ -1,351 +1,350 @@
-import gradio as gr
-import pandas as pd
-import os
-import re
-import html
-from pathlib import Path
-# Function to load all CSV files from the current directory
-def load_csv_files():
-    csv_files = {}
-    current_dir = Path(".")
-    for file in current_dir.glob("*_sorted.csv"):
-        try:
-            df = pd.read_csv(file, encoding='utf-8')
-            # Fill NaN values with empty strings to avoid issues
-            df = df.fillna("")
-            # Clean the city name from the filename
-            city_name = file.stem.replace('_sorted', '')
-            city_name = city_name.replace('_', ' ').title()
-            csv_files[city_name] = df
-        except Exception as e:
-            print(f"Error loading {file}: {e}")
-    return csv_files
-# Function to get unique queries for a specific city
-def get_queries_for_city(city):
-    if city not in all_data:
-        return []
-    # Get unique queries from the dataframe
-    queries = all_data[city]['query'].dropna().unique().tolist()
-    # Sort queries and filter out empty strings
-    queries = sorted([str(q) for q in queries if q and str(q).strip()])
-    return queries
-# Function to find entries that have empty or missing queries
-def find_empty_queries(city, preserve_order=True):
-    data = all_data.get(city)
-    if data is None:
-        return "City data not found"
-    results = []
-    for i, row in data.iterrows():
-        # Check if query is empty or NaN
-        if pd.isna(row['query']) or str(row['query']).strip() == "":
-            # Make sure all values are strings and handle NaN/None values
-            context = str(row['context']) if not pd.isna(row['context']) else ""
-            query = "(No Query)" if pd.isna(row['query']) else str(row['query'])
-            url = str(row['url']) if not pd.isna(row['url']) else ""
-            results.append({
-                'url': url,
-                'context': context,
-                'query': query,
-                'original_index': i  # Store the original row index
-            })
-    # Format results using the same HTML formatting as search_data
-    if not results:
-        return "No entries without queries found"
-    # Sort results by their original index if preserve_order is True
-    if preserve_order:
-        results.sort(key=lambda x: x['original_index'])
-    # Create HTML formatted results for clickable links with better styling
-    formatted_results = "<div class='search-results'>"
-    for i, result in enumerate(results, 1):
-        url = result['url']
-        url_safe = html.escape(url)
-        original_idx = result['original_index'] + 1  # +1 for 1-based indexing for display
-        formatted_results += f"<div class='result-item'>"
-        formatted_results += f"<h3>Entry Without Query #{i} <span class='original-index'>(Dataset Row: {original_idx})</span></h3>"
-        formatted_results += f"<p><b>URL:</b> <a href='{url_safe}' target='_blank'>{url_safe}</a></p>"
-        # Handle context display safely
-        context = result['context']
-        try:
-            context_preview = context[:300] + ('...' if len(context) > 300 else '')
-            context_preview = html.escape(context_preview)
-        except (TypeError, AttributeError):
-            context_preview = html.escape(str(context))
-        formatted_results += f"<p><b>Context:</b> {context_preview}</p>"
-        formatted_results += "</div><hr>"
-    formatted_results += "</div>"
-    return formatted_results
-# Function to search through the dataframes based on query
-def search_data(city, search_type, search_query, case_sensitive=False, preserve_order=True):
-    data = all_data.get(city)
-    if data is None:
-        return "City data not found"
-    # Check if search_query is empty or None
-    if not search_query or str(search_query).strip() == "":
-        return "Please enter a search query"
-    # Ensure search_query is a string
-    search_query = str(search_query)
-    # Convert search query to lowercase if not case sensitive
-    if not case_sensitive:
-        search_query = search_query.lower()
-    results = []
-    if search_type == "Simple Text Search":
-        for i, row in data.iterrows():
-            # Make sure all values are strings and handle NaN/None values
-            context = str(row['context']) if not pd.isna(row['context']) else ""
-            query = str(row['query']) if not pd.isna(row['query']) else ""
-            url = str(row['url']) if not pd.isna(row['url']) else ""
-            # Check in context and query based on case sensitivity
-            context_to_check = context if case_sensitive else context.lower()
-            query_to_check = query if case_sensitive else query.lower()
-            if search_query in context_to_check or search_query in query_to_check:
-                results.append({
-                    'url': url,
-                    'context': context,
-                    'query': query,
-                    'original_index': i  # Store the original row index
-                })
-    elif search_type == "Regular Expression Search":
-        try:
-            pattern = re.compile(search_query, flags=0 if case_sensitive else re.IGNORECASE)
-            for i, row in data.iterrows():
-                # Make sure all values are strings and handle NaN/None values
-                context = str(row['context']) if not pd.isna(row['context']) else ""
-                query = str(row['query']) if not pd.isna(row['query']) else ""
-                url = str(row['url']) if not pd.isna(row['url']) else ""
-                try:
-                    if pattern.search(context) or pattern.search(query):
-                        results.append({
-                            'url': url,
-                            'context': context,
-                            'query': query,
-                            'original_index': i  # Store the original row index
-                        })
-                except (TypeError, AttributeError) as e:
-                    print(f"Error searching row {i}: {e}")
-                    continue
-        except re.error as e:
-            return f"Regular expression error: {str(e)}"
-    # Format results
-    if not results:
-        return "No matching results found"
-    # Sort results by their original index if preserve_order is True
-    if preserve_order:
-        results.sort(key=lambda x: x['original_index'])
-    # Create HTML formatted results for clickable links with better styling
-    formatted_results = "<div class='search-results'>"
-    for i, result in enumerate(results, 1):
-        url = result['url']
-        url_safe = html.escape(url)
-        original_idx = result['original_index'] + 1  # +1 for 1-based indexing for display
-        formatted_results += f"<div class='result-item'>"
-        formatted_results += f"<h3>Result {i} <span class='original-index'>(Dataset Row: {original_idx})</span></h3>"
-        formatted_results += f"<p><b>URL:</b> <a href='{url_safe}' target='_blank'>{url_safe}</a></p>"
-        formatted_results += f"<p><b>Query:</b> {html.escape(str(result['query']))}</p>"
-        # Handle context display safely
-        context = result['context']
-        try:
-            context_preview = context[:300] + ('...' if len(context) > 300 else '')
-            context_preview = html.escape(context_preview)
-        except (TypeError, AttributeError):
-            context_preview = html.escape(str(context))
-        formatted_results += f"<p><b>Context:</b> {context_preview}</p>"
-        formatted_results += "</div><hr>"
-    formatted_results += "</div>"
-    return formatted_results
-# Load all CSV files on startup
-all_data = load_csv_files()
-city_names = list(all_data.keys())
-if not city_names:
-    city_names = ["No data found"]
-# Create the Gradio interface
-with gr.Blocks(title="Ancient Cities CSV Query") as app:
-    gr.Markdown("# Ancient Cities CSV Query Interface")
-    gr.Markdown("Search through information about ancient cities from CSV files.")
-    with gr.Row():
-        with gr.Column():
-            city_dropdown = gr.Dropdown(
-                choices=city_names,
-                value=city_names[0] if city_names else None,
-                label="Select City"
-            )
-            # Dropdown for queries based on the selected city
-            query_dropdown = gr.Dropdown(
-                choices=get_queries_for_city(city_names[0] if city_names else None),
-                label="Select a Query",
-                allow_custom_value=True
-            )
-            search_type = gr.Radio(
-                choices=["Simple Text Search", "Regular Expression Search"],
-                value="Simple Text Search",
-                label="Search Type"
-            )
-            # Keep a text box for custom queries
-            search_query = gr.Textbox(
-                label="Custom Search Query (optional)",
-                placeholder="Enter custom text to search for..."
-            )
-            case_sensitive = gr.Checkbox(
-                label="Case Sensitive",
-                value=False
-            )
-            show_empty_queries = gr.Checkbox(
-                label="Show Entries Without Queries",
-                value=False,
-                info="Check this to display entries that have empty or missing queries"
-            )
-            preserve_order = gr.Checkbox(
-                label="Preserve Original Dataset Order",
-                value=True,
-                info="When checked, results will be displayed in their original order from the dataset. When unchecked, results will be displayed in the order they are found."
-            )
-            search_button = gr.Button("Search")
-        with gr.Column():
-            results_text = gr.HTML(
-                label="Search Results",
-                value="",
-                elem_classes=["results-output"]
-            )
-            stats_text = gr.Textbox(
-                label="Dataset Statistics",
-                value=f"Total cities loaded: {len(city_names)}\nCities: {', '.join(city_names)}"
-            )
-    # Update the query dropdown when the city changes
-    def update_queries(city):
-        return gr.Dropdown(choices=get_queries_for_city(city))
-    city_dropdown.change(
-        fn=update_queries,
-        inputs=city_dropdown,
-        outputs=query_dropdown
-    )
-    # Use either the dropdown query or the custom search query
-    def search_with_queries(city, search_type, query_from_dropdown, custom_query, case_sensitive, show_empty_queries, preserve_order):
-        if show_empty_queries:
-            # If show_empty_queries is checked, we show entries without queries
-            return find_empty_queries(city, preserve_order)
-        else:
-            # Otherwise, use the custom query if provided, otherwise use the dropdown selection
-            final_query = custom_query if custom_query and custom_query.strip() else query_from_dropdown
-            return search_data(city, search_type, final_query, case_sensitive, preserve_order)
-    search_button.click(
-        fn=search_with_queries,
-        inputs=[city_dropdown, search_type, query_dropdown, search_query, case_sensitive, show_empty_queries, preserve_order],
-        outputs=results_text
-    )
-# Launch the app
-if __name__ == "__main__":
-    try:
-        print("Starting Ancient Cities Query Interface...")
-        print(f"Loaded {len(city_names)} cities: {', '.join(city_names)}")
-        # Add CSS within the Blocks instead of in launch()
-        with app:
-            gr.HTML("""
-            <style>
-            .gradio-container {
-                font-family: 'Arial', sans-serif;
-            }
-            .results-output {
-                max-height: 600px;
-                overflow-y: auto;
-                padding: 10px;
-                border: 1px solid #ddd;
-                border-radius: 5px;
-            }
-            a {
-                color: #007bff;
-                text-decoration: none;
-            }
-            a:hover {
-                text-decoration: underline;
-            }
-            b {
-                color: #333;
-            }
-            .search-results {
-                font-family: 'Arial', sans-serif;
-            }
-            .result-item {
-                margin-bottom: 15px;
-                padding: 10px;
-                background-color: #f9f9f9;
-                border-radius: 5px;
-            }
-            .result-item h3 {
-                margin-top: 0;
-                color: #333;
-            }
-            .original-index {
-                font-size: 0.8em;
-                color: #666;
-                font-weight: normal;
-            }
-            .result-item:nth-child(odd) {
-                background-color: #f5f5f5;
-            }
-            .result-item:nth-child(even) {
-                background-color: #ffffff;
-            }
-            hr {
-                border: 0;
-                height: 1px;
-                background-color: #ddd;
-                margin: 15px 0;
-            }
-            </style>
-            """)
-        app.launch(show_error=True)
-    except Exception as e:
-        print(f"Error starting application: {e}")
-        import traceback
-        traceback.print_exc()

+import gradio as gr
+import pandas as pd
+import os
+import re
+import html
+from pathlib import Path
+# Function to load all CSV files from the current directory
+def load_csv_files():
+    csv_files = {}
+    current_dir = Path(".")
+    for file in current_dir.glob("*_sorted.csv"):
+        try:
+            df = pd.read_csv(file, encoding='utf-8')
+            # Fill NaN values with empty strings to avoid issues
+            df = df.fillna("")
+            # Clean the city name from the filename
+            city_name = file.stem.replace('_sorted', '')
+            city_name = city_name.replace('_', ' ').title()
+            csv_files[city_name] = df
+        except Exception as e:
+            print(f"Error loading {file}: {e}")
+    return csv_files
+# Function to get unique queries for a specific city
+def get_queries_for_city(city):
+    if city not in all_data:
+        return []
+    # Get unique queries from the dataframe
+    queries = all_data[city]['query'].dropna().unique().tolist()
+    # Sort queries and filter out empty strings
+    queries = sorted([str(q) for q in queries if q and str(q).strip()])
+    return queries
+# Function to find entries that have empty or missing queries
+def find_empty_queries(city, preserve_order=True):
+    data = all_data.get(city)
+    if data is None:
+        return "City data not found"
+    results = []
+    for i, row in data.iterrows():
+        # Check if query is empty or NaN
+        if pd.isna(row['query']) or str(row['query']).strip() == "":
+            # Make sure all values are strings and handle NaN/None values
+            context = str(row['context']) if not pd.isna(row['context']) else ""
+            query = "(No Query)" if pd.isna(row['query']) else str(row['query'])
+            url = str(row['url']) if not pd.isna(row['url']) else ""
+            results.append({
+                'url': url,
+                'context': context,
+                'query': query,
+                'original_index': i  # Store the original row index
+            })
+    # Format results using the same HTML formatting as search_data
+    if not results:
+        return "No entries without queries found"
+    # Sort results by their original index if preserve_order is True
+    if preserve_order:
+        results.sort(key=lambda x: x['original_index'])
+    # Create HTML formatted results for clickable links with better styling
+    formatted_results = "<div class='search-results'>"
+    for i, result in enumerate(results, 1):
+        url = result['url']
+        url_safe = html.escape(url)
+        original_idx = result['original_index'] + 1  # +1 for 1-based indexing for display
+        formatted_results += f"<div class='result-item'>"
+        formatted_results += f"<h3>Entry Without Query #{i} <span class='original-index'>(Dataset Row: {original_idx})</span></h3>"
+        formatted_results += f"<p><b>URL:</b> <a href='{url_safe}' target='_blank'>{url_safe}</a></p>"
+        # Handle context display safely
+        context = result['context']
+        try:
+            context_preview = context[:300] + ('...' if len(context) > 300 else '')
+            context_preview = html.escape(context_preview)
+        except (TypeError, AttributeError):
+            context_preview = html.escape(str(context))
+        formatted_results += f"<p><b>Context:</b> {context_preview}</p>"
+        formatted_results += "</div><hr>"
+    formatted_results += "</div>"
+    return formatted_results
+# Function to search through the dataframes based on query
+def search_data(city, search_type, search_query, case_sensitive=False, preserve_order=True):
+    data = all_data.get(city)
+    if data is None:
+        return "City data not found"
+    # Check if search_query is empty or None
+    if not search_query or str(search_query).strip() == "":
+        return "Please enter a search query"
+    # Ensure search_query is a string
+    search_query = str(search_query)
+    # Convert search query to lowercase if not case sensitive
+    if not case_sensitive:
+        search_query = search_query.lower()
+    results = []
+    if search_type == "Simple Text Search":
+        for i, row in data.iterrows():
+            # Make sure all values are strings and handle NaN/None values
+            context = str(row['context']) if not pd.isna(row['context']) else ""
+            query = str(row['query']) if not pd.isna(row['query']) else ""
+            url = str(row['url']) if not pd.isna(row['url']) else ""
+            # Check in context and query based on case sensitivity
+            context_to_check = context if case_sensitive else context.lower()
+            query_to_check = query if case_sensitive else query.lower()
+            if search_query in context_to_check or search_query in query_to_check:
+                results.append({
+                    'url': url,
+                    'context': context,
+                    'query': query,
+                    'original_index': i  # Store the original row index
+                })
+    elif search_type == "Regular Expression Search":
+        try:
+            pattern = re.compile(search_query, flags=0 if case_sensitive else re.IGNORECASE)
+            for i, row in data.iterrows():
+                # Make sure all values are strings and handle NaN/None values
+                context = str(row['context']) if not pd.isna(row['context']) else ""
+                query = str(row['query']) if not pd.isna(row['query']) else ""
+                url = str(row['url']) if not pd.isna(row['url']) else ""
+                try:
+                    if pattern.search(context) or pattern.search(query):
+                        results.append({
+                            'url': url,
+                            'context': context,
+                            'query': query,
+                            'original_index': i  # Store the original row index
+                        })
+                except (TypeError, AttributeError) as e:
+                    print(f"Error searching row {i}: {e}")
+                    continue
+        except re.error as e:
+            return f"Regular expression error: {str(e)}"
+    # Format results
+    if not results:
+        return "No matching results found"
+    # Sort results by their original index if preserve_order is True
+    if preserve_order:
+        results.sort(key=lambda x: x['original_index'])
+    # Create HTML formatted results for clickable links with better styling
+    formatted_results = "<div class='search-results'>"
+    for i, result in enumerate(results, 1):
+        url = result['url']
+        url_safe = html.escape(url)
+        original_idx = result['original_index'] + 1  # +1 for 1-based indexing for display
+        formatted_results += f"<div class='result-item'>"
+        formatted_results += f"<h3>Result {i} <span class='original-index'>(Dataset Row: {original_idx})</span></h3>"
+        formatted_results += f"<p><b>URL:</b> <a href='{url_safe}' target='_blank'>{url_safe}</a></p>"
+        formatted_results += f"<p><b>Query:</b> {html.escape(str(result['query']))}</p>"
+        # Handle context display safely
+        context = result['context']
+        try:
+            context_preview = context[:300] + ('...' if len(context) > 300 else '')
+            context_preview = html.escape(context_preview)
+        except (TypeError, AttributeError):
+            context_preview = html.escape(str(context))
+        formatted_results += f"<p><b>Context:</b> {context_preview}</p>"
+        formatted_results += "</div><hr>"
+    formatted_results += "</div>"
+    return formatted_results
+# Load all CSV files on startup
+all_data = load_csv_files()
+city_names = list(all_data.keys())
+if not city_names:
+    city_names = ["No data found"]
+# Create the Gradio interface
+with gr.Blocks(title="Query engine") as app:
+    gr.Markdown("# Archaelogical Query Engine")
+    with gr.Row():
+        with gr.Column():
+            city_dropdown = gr.Dropdown(
+                choices=city_names,
+                value=city_names[0] if city_names else None,
+                label="Select City"
+            )
+            # Dropdown for queries based on the selected city
+            query_dropdown = gr.Dropdown(
+                choices=get_queries_for_city(city_names[0] if city_names else None),
+                label="Select a Query",
+                allow_custom_value=True
+            )
+            search_type = gr.Radio(
+                choices=["Simple Text Search", "Regular Expression Search"],
+                value="Simple Text Search",
+                label="Search Type"
+            )
+            # Keep a text box for custom queries
+            search_query = gr.Textbox(
+                label="Custom Search Query (optional)",
+                placeholder="Enter custom text to search for..."
+            )
+            case_sensitive = gr.Checkbox(
+                label="Case Sensitive",
+                value=False
+            )
+            show_empty_queries = gr.Checkbox(
+                label="Show Entries Without Queries",
+                value=False,
+                info="Check this to display entries that have empty or missing queries"
+            )
+            preserve_order = gr.Checkbox(
+                label="Preserve Original Dataset Order",
+                value=True,
+                info="When checked, results will be displayed in their original order from the dataset. When unchecked, results will be displayed in the order they are found."
+            )
+            search_button = gr.Button("Search")
+        with gr.Column():
+            results_text = gr.HTML(
+                label="Search Results",
+                value="",
+                elem_classes=["results-output"]
+            )
+            stats_text = gr.Textbox(
+                label="Dataset Statistics",
+                value=f"Total cities loaded: {len(city_names)}\nCities: {', '.join(city_names)}"
+            )
+    # Update the query dropdown when the city changes
+    def update_queries(city):
+        return gr.Dropdown(choices=get_queries_for_city(city))
+    city_dropdown.change(
+        fn=update_queries,
+        inputs=city_dropdown,
+        outputs=query_dropdown
+    )
+    # Use either the dropdown query or the custom search query
+    def search_with_queries(city, search_type, query_from_dropdown, custom_query, case_sensitive, show_empty_queries, preserve_order):
+        if show_empty_queries:
+            # If show_empty_queries is checked, we show entries without queries
+            return find_empty_queries(city, preserve_order)
+        else:
+            # Otherwise, use the custom query if provided, otherwise use the dropdown selection
+            final_query = custom_query if custom_query and custom_query.strip() else query_from_dropdown
+            return search_data(city, search_type, final_query, case_sensitive, preserve_order)
+    search_button.click(
+        fn=search_with_queries,
+        inputs=[city_dropdown, search_type, query_dropdown, search_query, case_sensitive, show_empty_queries, preserve_order],
+        outputs=results_text
+    )
+# Launch the app
+if __name__ == "__main__":
+    try:
+        print("Starting Ancient Cities Query Interface...")
+        print(f"Loaded {len(city_names)} cities: {', '.join(city_names)}")
+        # Add CSS within the Blocks instead of in launch()
+        with app:
+            gr.HTML("""
+            <style>
+            .gradio-container {
+                font-family: 'Arial', sans-serif;
+            }
+            .results-output {
+                max-height: 600px;
+                overflow-y: auto;
+                padding: 10px;
+                border: 1px solid #ddd;
+                border-radius: 5px;
+            }
+            a {
+                color: #007bff;
+                text-decoration: none;
+            }
+            a:hover {
+                text-decoration: underline;
+            }
+            b {
+                color: #333;
+            }
+            .search-results {
+                font-family: 'Arial', sans-serif;
+            }
+            .result-item {
+                margin-bottom: 15px;
+                padding: 10px;
+                background-color: #f9f9f9;
+                border-radius: 5px;
+            }
+            .result-item h3 {
+                margin-top: 0;
+                color: #333;
+            }
+            .original-index {
+                font-size: 0.8em;
+                color: #666;
+                font-weight: normal;
+            }
+            .result-item:nth-child(odd) {
+                background-color: #f5f5f5;
+            }
+            .result-item:nth-child(even) {
+                background-color: #ffffff;
+            }
+            hr {
+                border: 0;
+                height: 1px;
+                background-color: #ddd;
+                margin: 15px 0;
+            }
+            </style>
+            """)
+        app.launch(show_error=True)
+    except Exception as e:
+        print(f"Error starting application: {e}")
+        import traceback
+        traceback.print_exc()