"""Gradio app that converts a SQLite database of legal cases into a zipped ChromaDB store.

The uploaded database must contain a ``cases`` table with the columns named in
``CASES_QUERY``. Every row becomes one document in a persistent Chroma
collection, and the collection directory is returned to the user as a ZIP file.
"""

import os
import shutil
import sqlite3
import tempfile
import zipfile
from contextlib import closing

import chromadb
import gradio as gr
from chromadb.utils import embedding_functions

# Output locations (relative to the current working directory).
VECTOR_DB_PATH = "./legal_vector_db"
ZIP_PATH = "legal_vector_db.zip"

COLLECTION_NAME = "legal_cases_collection"
# Multilingual sentence-transformer model chosen by the original author for Nepali text.
EMBEDDING_MODEL_NAME = "paraphrase-multilingual-MiniLM-L12-v2"
BATCH_SIZE = 100

# Column order here must match _METADATA_KEYS below.
CASES_QUERY = """
    SELECT लिङ्क, निर्णय_नं, साल, मुद्दाको_किसिम, विषय, निवेदक, विपक्षी, प्रकरण, ठहर
    FROM cases
"""

# Metadata key for each selected column, in SELECT order.
_METADATA_KEYS = (
    "link",
    "decision_no",
    "year",
    "mudda_type",
    "subject",
    "nibedak",
    "vipakshi",
    "prakaran",
    "thahar",
)

# (Nepali label, metadata key) pairs that make up the embedded document text.
_DOCUMENT_FIELDS = (
    ("मुद्दाको किसिम", "mudda_type"),
    ("विषय", "subject"),
    ("निवेदक", "nibedak"),
    ("विपक्षी", "vipakshi"),
    ("प्रकरण", "prakaran"),
    ("ठहर", "thahar"),
)


def _load_case_rows(db_path):
    """Return every row selected by ``CASES_QUERY`` from the SQLite file at *db_path*."""
    # sqlite3's own context manager only commits/rolls back; closing() is what
    # actually releases the file handle so the temporary copy can be deleted.
    with closing(sqlite3.connect(db_path)) as conn:
        return conn.execute(CASES_QUERY).fetchall()


def _build_records(rows):
    """Turn raw case rows into parallel ``(documents, metadatas, ids)`` lists."""
    documents = []
    metadatas = []
    ids = []
    for index, row in enumerate(rows):
        # NULL columns come back as None. Store them as empty strings so the
        # literal text "None" is not embedded and metadata holds plain scalars.
        # NOTE(review): confirm which metadata value types the installed
        # chromadb version accepts.
        metadata = {
            key: "" if value is None else value
            for key, value in zip(_METADATA_KEYS, row)
        }
        documents.append(
            "\n".join(f"{label}: {metadata[key]}" for label, key in _DOCUMENT_FIELDS)
        )
        metadatas.append(metadata)
        ids.append(f"case_{index}")
    return documents, metadatas, ids


def _zip_directory(source_dir, zip_path):
    """Write *source_dir* into a fresh ZIP at *zip_path*, keeping the directory name as root."""
    if os.path.exists(zip_path):
        os.remove(zip_path)

    # Archive names are relative to the parent, so entries start with the
    # directory's own name (e.g. "legal_vector_db/...").
    parent_dir = os.path.join(source_dir, os.pardir)
    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as archive:
        for root, _dirs, files in os.walk(source_dir):
            for file_name in files:
                full_path = os.path.join(root, file_name)
                archive.write(full_path, os.path.relpath(full_path, parent_dir))


def convert_to_vector_db(sqlite_file):
    """Build a Chroma vector DB from an uploaded SQLite file and return the ZIP path.

    Args:
        sqlite_file: The uploaded file as delivered by ``gr.File``: a filesystem
            path, or an object exposing the path as ``.name``. ``None`` means
            nothing was uploaded.

    Returns:
        The path of the ZIP archive containing the vector DB directory, or
        ``None`` when no file was uploaded.

    Raises:
        sqlite3.Error: If the upload is not a readable SQLite database or lacks
            the expected ``cases`` table/columns.
    """
    if sqlite_file is None:
        return None  # No file uploaded, return nothing

    # NOTE(review): some Gradio versions hand over a temp-file wrapper rather
    # than a path; accept both shapes.
    if isinstance(sqlite_file, (str, os.PathLike)):
        source_path = sqlite_file
    else:
        source_path = sqlite_file.name

    # Work on a private copy that is removed even when reading fails. Rows are
    # read BEFORE the old vector DB is deleted, so a bad upload does not
    # destroy the previous result.
    with tempfile.TemporaryDirectory() as work_dir:
        db_path = os.path.join(work_dir, "temp.db")
        shutil.copy(source_path, db_path)
        rows = _load_case_rows(db_path)

    documents, metadatas, ids = _build_records(rows)

    # Start from a clean directory so stale data never ends up in the archive.
    if os.path.exists(VECTOR_DB_PATH):
        shutil.rmtree(VECTOR_DB_PATH)

    chroma_client = chromadb.PersistentClient(path=VECTOR_DB_PATH)
    sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name=EMBEDDING_MODEL_NAME
    )
    collection = chroma_client.create_collection(
        name=COLLECTION_NAME,
        embedding_function=sentence_transformer_ef,
    )

    # Add in batches of BATCH_SIZE rather than all documents in one call.
    for start in range(0, len(documents), BATCH_SIZE):
        stop = start + BATCH_SIZE
        collection.add(
            documents=documents[start:stop],
            metadatas=metadatas[start:stop],
            ids=ids[start:stop],
        )

    # NOTE(review): the directory is zipped while the Chroma client is still
    # alive; confirm everything has been flushed to disk at this point.
    _zip_directory(VECTOR_DB_PATH, ZIP_PATH)
    return ZIP_PATH


# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("SQLite DB to Vector DB Converter (Nepali Legal Cases Supported)")
    sqlite_upload = gr.File(label="Upload SQLite DB File (e.g., after_2061.db)")
    convert_btn = gr.Button("Convert to Vector DB")
    download_file = gr.File(label="Download Vector DB (ZIP File)")
    convert_btn.click(convert_to_vector_db, inputs=sqlite_upload, outputs=download_file)


if __name__ == "__main__":
    demo.launch()