# rbbist's picture
# Create app.py
# 05e4e1f verified
import os
import shutil
import sqlite3
import tempfile
import zipfile
from contextlib import closing

import chromadb
import gradio as gr
from chromadb.utils import embedding_functions
# Metadata keys, in the same order as the columns selected from ``cases``.
_CASE_FIELDS = (
    "link",
    "decision_no",
    "year",
    "mudda_type",
    "subject",
    "nibedak",
    "vipakshi",
    "prakaran",
    "thahar",
)


def _load_case_rows(db_path):
    """Return all rows of the ``cases`` table in *db_path* as 9-tuples."""
    # sqlite3's own context manager only commits/rolls back the transaction;
    # closing() is what actually releases the connection and its file handle.
    with closing(sqlite3.connect(db_path)) as conn:
        cursor = conn.execute("""
            SELECT लिङ्क, निर्णय_नं, साल, मुद्दाको_किसिम, विषय, निवेदक, विपक्षी, प्रकरण, ठहर
            FROM cases
        """)
        return cursor.fetchall()


def _build_case_records(rows):
    """Turn raw ``cases`` rows into parallel (documents, metadatas, ids) lists."""
    documents = []
    metadatas = []
    ids = []
    for index, row in enumerate(rows):
        # SQLite NULLs arrive as None. Use "" instead so the embedded text does
        # not contain the literal word "None" and no metadata value is None
        # (Chroma's metadata validation may reject None values).
        fields = dict(zip(_CASE_FIELDS, ("" if value is None else value for value in row)))

        # Combine text in Nepali format
        case_text = "\n".join([
            f"मुद्दाको किसिम: {fields['mudda_type']}",
            f"विषय: {fields['subject']}",
            f"निवेदक: {fields['nibedak']}",
            f"विपक्षी: {fields['vipakshi']}",
            f"प्रकरण: {fields['prakaran']}",
            f"ठहर: {fields['thahar']}",
        ])
        documents.append(case_text.strip())
        metadatas.append(fields)
        ids.append(f"case_{index}")
    return documents, metadatas, ids


def _zip_directory(source_dir, zip_path):
    """Write every file under *source_dir* into *zip_path*.

    Archive names are relative to the parent of *source_dir*, so the archive
    unpacks into a single top-level folder named after *source_dir*.
    """
    archive_root = os.path.join(source_dir, '..')
    # Mode 'w' truncates an existing archive, so no explicit removal is needed.
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, _dirs, files in os.walk(source_dir):
            for filename in files:
                full_path = os.path.join(root, filename)
                zipf.write(full_path, os.path.relpath(full_path, archive_root))


def convert_to_vector_db(sqlite_file):
    """Convert an uploaded SQLite database of legal cases into a zipped Chroma DB.

    Args:
        sqlite_file: Path of the uploaded SQLite file, or ``None`` when nothing
            was uploaded. The file must contain a ``cases`` table with the
            Nepali-named columns selected in ``_load_case_rows``.

    Returns:
        The path of the created ZIP archive (``"legal_vector_db.zip"``), or
        ``None`` when *sqlite_file* is ``None``.

    Raises:
        sqlite3.Error: If the upload cannot be queried (for example, it has no
            ``cases`` table). This is raised before the existing vector DB
            directory is touched.
    """
    if sqlite_file is None:
        return None  # No file uploaded, return nothing

    # Work on a uniquely named private copy and always delete it, even when
    # reading fails, so nothing is left behind in the working directory.
    fd, db_path = tempfile.mkstemp(suffix=".db")
    os.close(fd)
    try:
        shutil.copy(sqlite_file, db_path)
        rows = _load_case_rows(db_path)
    finally:
        os.remove(db_path)

    documents, metadatas, ids = _build_case_records(rows)

    # Only now that the input is known to be readable, replace any vector DB
    # left over from a previous run.
    vector_db_path = "./legal_vector_db"
    if os.path.exists(vector_db_path):
        shutil.rmtree(vector_db_path)

    # Initialize ChromaDB persistent client
    chroma_client = chromadb.PersistentClient(path=vector_db_path)

    # Use multilingual embedding model suitable for Nepali
    sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="paraphrase-multilingual-MiniLM-L12-v2"
    )

    collection = chroma_client.create_collection(
        name="legal_cases_collection",
        embedding_function=sentence_transformer_ef,
    )

    # Add to collection in batches
    batch_size = 100
    for start in range(0, len(documents), batch_size):
        stop = start + batch_size
        collection.add(
            documents=documents[start:stop],
            metadatas=metadatas[start:stop],
            ids=ids[start:stop],
        )

    # Zip the vector DB directory
    zip_path = "legal_vector_db.zip"
    _zip_directory(vector_db_path, zip_path)
    return zip_path
# Gradio UI: upload a SQLite file, run the conversion on click, and expose the
# resulting ZIP archive through a second file component.
with gr.Blocks() as demo:
    gr.Markdown("SQLite DB to Vector DB Converter (Nepali Legal Cases Supported)")

    sqlite_upload = gr.File(label="Upload SQLite DB File (e.g., after_2061.db)")
    convert_btn = gr.Button("Convert to Vector DB")
    download_file = gr.File(label="Download Vector DB (ZIP File)")

    # The button feeds the uploaded file to the converter and shows its return
    # value (the ZIP path) in the download component.
    convert_btn.click(
        fn=convert_to_vector_db,
        inputs=sqlite_upload,
        outputs=download_file,
    )

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()