import ast
import re
import sqlite3
import unicodedata
from pathlib import Path
from typing import Optional

import gradio as gr
import pandas as pd
import requests

# --- Constants ---
DATABASE_URL = "https://raw.githubusercontent.com/R3gm/database_zip_files/main/archive/database.csv"
DATABASE_PATH = Path("database.csv")
# Set once in the ``__main__`` block; stays ``None`` until the index is built.
DB_CONNECTION: Optional[sqlite3.Connection] = None

# Separator characters that normalize_text() turns into spaces.
_SEPARATOR_PATTERN = re.compile(r"[+()\-_/.]")

# --- UI Configuration ---
APP_TITLE = "## 🔍 RVC Voice Finder"
APP_DESCRIPTION = (
    "This app digs through Hugging Face’s public zip files hunting for RVC models… "
    "and occasionally brings back random stuff that has nothing to do with them. "
    "Don’t worry though—the best matches are always shown first."
)


# --- Function Definitions ---
def setup_database() -> Optional[sqlite3.Connection]:
    """
    Download the CSV database, preprocess it, and load it into an in-memory
    SQLite FTS5 table for fast text searching.

    If the download fails, a previously downloaded copy at DATABASE_PATH is
    used instead.

    Returns:
        An open connection to the in-memory database, holding a ``models``
        table and a ``models_fts`` FTS5 index over it.

    Raises:
        FileNotFoundError: The download failed and no local copy exists.
    """
    print("Setting up the database...")

    # Stream into a sibling ".part" file and swap it in only after the whole
    # transfer succeeded, so an interrupted download can never truncate the
    # local copy that the fallback below relies on.
    partial_path = DATABASE_PATH.with_name(DATABASE_PATH.name + ".part")
    try:
        print(f"Downloading data from {DATABASE_URL}...")
        with requests.get(DATABASE_URL, stream=True, timeout=30) as response:
            response.raise_for_status()
            with open(partial_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        partial_path.replace(DATABASE_PATH)
        print("Download complete.")
    except requests.exceptions.RequestException as e:
        print(f"Error downloading the database: {e}")
        # Use local file if it exists, otherwise fail
        if not DATABASE_PATH.exists():
            raise FileNotFoundError(
                f"Failed to download and local copy not found at {DATABASE_PATH}"
            ) from e
        print("Using existing local database file.")
    finally:
        # Remove the leftover of a failed transfer (already gone on success).
        if partial_path.exists():
            partial_path.unlink()

    # Load and preprocess the data with pandas
    df = pd.read_csv(DATABASE_PATH)
    df = df.rename(
        columns={"FILENAME": "Filename", "PARSED_URL": "URL", "MODEL_ID": "Repo ID"}
    )
    df["normalized_filename"] = df["Filename"].apply(normalize_text)
    df["URL"] = df["URL"].apply(clean_file_url)
    # Use original index as rowid
    df = df.reset_index().rename(columns={"index": "rowid"})

    # Connect to an in-memory SQLite database. check_same_thread=False lets
    # the connection be used from threads other than the one that created it.
    conn = sqlite3.connect(":memory:", check_same_thread=False)

    # Load the main data into a standard table
    df.to_sql("models", conn, index=False, if_exists="replace")

    # Create and populate the FTS5 virtual table for fast searching
    conn.execute("""
        CREATE VIRTUAL TABLE models_fts USING fts5(
            Filename,
            normalized_filename,
            URL,
            'Repo ID',
            content='models',
            content_rowid='rowid'
        );
    """)
    conn.execute("""
        INSERT INTO models_fts(rowid, Filename, normalized_filename, URL, "Repo ID")
        SELECT rowid, Filename, normalized_filename, URL, "Repo ID" FROM models;
    """)

    print("Database setup complete and loaded into memory.")
    return conn


def normalize_text(text: str) -> str:
    """
    Clean and standardize text for searching by lowercasing, removing
    accents, and replacing separators with spaces.

    Missing values (None / NaN) become an empty string; any other non-string
    value (e.g. a filename pandas parsed as a number) is converted with str().
    """
    if pd.isna(text):
        return ""
    lowered = str(text).lower()
    # NFD splits accented letters into base letter + combining mark ('Mn');
    # dropping the marks leaves the unaccented letters.
    without_accents = "".join(
        c for c in unicodedata.normalize("NFD", lowered)
        if unicodedata.category(c) != "Mn"
    )
    return _SEPARATOR_PATTERN.sub(" ", without_accents)


def clean_file_url(val) -> str:
    """
    Clean a URL cell, handling lists stored as strings.

    A cell that looks like a Python list literal is parsed and its items are
    joined with ", "; anything else is returned as a string. Missing values
    become an empty string.
    """
    if pd.isna(val):
        return ""
    if isinstance(val, str) and val.strip().startswith("["):
        try:
            # Safely evaluate string representation of a list
            parsed_list = ast.literal_eval(val)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Return original string if parsing fails
            return val
        if isinstance(parsed_list, list):
            return ", ".join(map(str, parsed_list))
        return val
    return str(val)


def _to_fts_term(keyword: str) -> Optional[str]:
    """Turn one keyword into a quoted FTS5 term, or None if nothing is searchable."""
    is_prefix = keyword.endswith("*")
    body = keyword.rstrip("*")
    # Punctuation-only keywords are dropped instead of becoming an empty phrase.
    if not any(ch.isalnum() for ch in body):
        return None
    # Quoting makes the keyword an FTS5 string, so characters such as ' : , !
    # are no longer parsed as query syntax; embedded quotes are doubled.
    term = '"' + body.replace('"', '""') + '"'
    # A trailing * outside the quotes keeps prefix search ("miku*") working.
    return term + "*" if is_prefix else term


def search_models(query: str) -> Optional[pd.DataFrame]:
    """
    Search the FTS table for models whose normalized filename matches every
    keyword of the query.

    Returns:
        A DataFrame with the columns ``Filename``, ``URL`` and ``Repo ID``
        (at most 250 rows, ordered by FTS rank), or None when the query is
        empty, the database is not loaded, the search fails, or nothing
        matches.
    """
    if DB_CONNECTION is None or not query or not query.strip():
        return None

    # Sanitize query and prepare for FTS by joining with "AND"
    terms = [_to_fts_term(keyword) for keyword in normalize_text(query).split()]
    fts_query = " AND ".join(term for term in terms if term is not None)
    if not fts_query:
        return None

    # Use FTS MATCH operator for efficient search
    sql_query = """
        SELECT Filename, URL, "Repo ID"
        FROM models_fts
        WHERE normalized_filename MATCH ?
        ORDER BY rank
        LIMIT 250;
    """

    try:
        df_results = pd.read_sql_query(sql_query, DB_CONNECTION, params=(fts_query,))
    except sqlite3.OperationalError as e:
        # This can happen if FTS query syntax is invalid
        gr.Warning(f"Search error: {e}")
        return None

    if df_results.empty:
        gr.Info("No matches found for your query.")
        return None

    return df_results


def _render_results_html(query: str) -> Optional[str]:
    """Run search_models() and render the hits as an HTML table for gr.HTML."""
    results = search_models(query)
    if results is None:
        return None
    # to_html escapes cell contents by default, so filenames containing markup
    # are displayed as text rather than interpreted by the browser.
    return results.to_html(index=False)


# --- Main Execution & Gradio App ---
if __name__ == "__main__":
    DB_CONNECTION = setup_database()

    with gr.Blocks() as demo:
        gr.Markdown(APP_TITLE)
        with gr.Row():
            query_input = gr.Textbox(
                label="Search here",
                placeholder="e.g., Hatsune Miku",
                scale=4,
            )
            search_button = gr.Button("Search", variant="primary", scale=1)

        output_df = gr.HTML(label="Search Results")
        gr.Markdown(APP_DESCRIPTION)

        # Event listeners. The HTML component needs a string, so the
        # DataFrame returned by search_models() is rendered first.
        query_input.submit(_render_results_html, inputs=query_input, outputs=output_df)
        search_button.click(_render_results_html, inputs=query_input, outputs=output_df)

    demo.launch(debug=True, show_error=True)