# sanatan_ai / copy_chromadb.py
# vikramvasudevan's picture
# Upload folder using huggingface_hub
# 1d00c8a verified
import argparse
import chromadb
from tqdm import tqdm # Optional: For progress bar
def _copy_job(store_path, collection, destination=None):
    """Build the configuration record for copying one collection.

    ``store_path`` is the path of the source Chroma store, ``collection`` is
    the collection name inside that store, and ``destination`` is the name
    the copy gets locally. ``destination`` defaults to ``collection`` because
    most copies keep the source collection's name.
    """
    return {
        "source_db_path": store_path,
        "source_collection_name": collection,
        "destination_collection_name": (
            collection if destination is None else destination
        ),
    }


# Source stores that hold more than one collection.
_UVEDA_STORE = "../uveda_analyzer/chromadb_store"
_BRAHMANAM_STORE = "../taitriya_brahmanam_ai/chromadb_store"
_PURANAS_STORE = "../puranas_ai/chromadb_store"
_STOTRA_STORE = "../stotra_patam_ai/chromadb_store"

# Copy jobs keyed by database id. Each record names the source store path,
# the source collection and the destination collection. Insertion order is
# kept because the ids are listed in this order in the --db choices and help.
db_config = {
    "youtube_db": _copy_job("../youtube_surfer_ai_agent/youtube_db", "yt_metadata"),
    "divya_prabandham": _copy_job(_UVEDA_STORE, "divya_prabandham"),
    "divya_prabandham_taniyans": _copy_job(_UVEDA_STORE, "divya_prabandham_taniyans"),
    "vishnu_sahasranamam": _copy_job(
        "../vishnu_sahasranamam_ai/output/chroma_store",
        "vishnu_sahasranamam",
        "vishnu_sahasranamam_openai",
    ),
    "bhagavat_gita": _copy_job(
        "../bhagavat_gita_chat/chromadb_store",
        "bhagavat_gita",
        "bhagavat_gita_openai",
    ),
    "pancha_sooktham": _copy_job("../sooktham_ai/chromadb_store", "pancha_sooktham"),
    "taitriya_upanishad": _copy_job(
        "../taitriya_upanishad_ai/chromadb_store", "taitriya_upanishad"
    ),
    "shanthi_panchakam": _copy_job(
        "../shanthi_panchakam_ai/chromadb_store", "shanthi_panchakam"
    ),
    "taitriya_samhitha": _copy_job(
        "../taitriya_samhitha_ai/chromadb_store", "taitriya_samhitha"
    ),
    "taitriya_brahmanam": _copy_job(_BRAHMANAM_STORE, "taitriya_brahmanam"),
    "katakam": _copy_job(_BRAHMANAM_STORE, "katakam"),
    "sri_stavam": _copy_job("../vedam_ai/chromadb-store", "sri_stavam"),
    "taitriya_aranyakam": _copy_job(
        "../taitriya_aranyakam_ai/chromadb_store", "taitriya_aranyakam"
    ),
    "brahma_sutra": _copy_job("../brahma_sutra_ai/chromadb_store", "brahma_sutra"),
    "valmiki_ramayanam": _copy_job(
        "../valmiki_ramayanam_ai/chromadb_store",
        "valmiki_ramayanam",
        "valmiki_ramayanam_openai",
    ),
    "sri_vachana_bhushanam": _copy_job(
        "../sri_vachana_bhushanam_ai/chromadb_store", "sri_vachana_bhushanam"
    ),
    "desika_prabandham": _copy_job(
        "../desika_prabandham_ai/chromadb_store", "desika_prabandham"
    ),
    "raghuveera_gadhyam": _copy_job(
        "../raghuveera_gadhyam_ai/chromadb_store", "raghuveera_gadhyam"
    ),
    "narayaneeyam": _copy_job("../narayaneeyam_ai/chromadb_store", "narayaneeyam"),
    "bhagavata_purana": _copy_job(
        "../bhagavata_purana_ai/chromadb_store", "bhagavata_purana"
    ),
    # Puranas that share one store and keep their collection name.
    **{
        purana: _copy_job(_PURANAS_STORE, purana)
        for purana in (
            "agnipuranam",
            "bhavishyapuranam",
            "brahmandpuranam",
            "brahmapuranam",
            "brahmavaivarthapurana",
            "garudapuranam",
            "harivanshapuraanam",
            "kurmapuranam",
            "lingapuranam",
            "markandeypuranam",
            "matsyapuranam",
            "naradapuranam",
            "padmapuranam",
            "shivapuraanam",
            "skandapuranam",
            "vaamanapuraanam",
            "vaayupuraanam",
            "varahapuranam",
        )
    },
    "vishnupuranam": _copy_job(
        _PURANAS_STORE, "vishnupuranam", "vishnu_puranam_openai"
    ),
    "vaazhi_thirunaamams": _copy_job(
        "../vaazhi_thirunamam/chromadb_store", "vaazhi_thirunaamams"
    ),
    "upadesa_rathnamalai": _copy_job(
        "../upadesa_rathnamalai/chromadb_store", "upadesa_rathnamalai"
    ),
    "thiruvaimozhi_nootrandhadhi": _copy_job(
        "../thiruvaimozhi_nootrandhadhi_ai/chromadb_store",
        "thiruvaimozhi_nootrandhadhi",
    ),
    # Stotras that share one store and keep their collection name.
    **{
        stotra: _copy_job(_STOTRA_STORE, stotra)
        for stotra in (
            "devaraja_ashtakam",
            "geethartha_sangraha",
            "mukunda_mala",
            "narasimha_ashtakam",
            "panchayudha_stotram",
            "ranganatha_stotram",
            "devaraja_mangalam",
            "dhati_panchakam",
            "mukthaka_mangalam",
            "venkateswara_stotram",
            "yathiraja_vimsathi",
            "gadhyathrayam",
        )
    },
    "aarthi_prabandham": _copy_job(
        "../aarthi_prabandham_ai/chromadb_store", "aarthi_prabandham"
    ),
    "padhuka_sahasram": _copy_job(
        "../padhuka_sahasram/chromadb_store", "padhuka_sahasram"
    ),
}
# Command-line interface: the caller selects which configured database to
# copy by passing one of the db_config ids via --db.
parser = argparse.ArgumentParser(description="My app with database parameter")
parser.add_argument(
    "--db",
    type=str,
    required=True,
    choices=list(db_config),
    help=f"Id of the database to use. allowed_values : {', '.join(db_config)}",
)
args = parser.parse_args()

# No manual validation is needed after parsing: ``required=True`` makes
# argparse exit with a usage error when --db is missing, and ``choices``
# rejects any value that is not a db_config key, so db_id is always valid here.
db_id = args.db
# Resolve the copy job selected on the command line.
copy_job = db_config[db_id]
source_collection_name = copy_job["source_collection_name"]
destination_collection_name = copy_job["destination_collection_name"]

# Connect to source and destination local persistent clients
source_client = chromadb.PersistentClient(path=copy_job["source_db_path"])
destination_client = chromadb.PersistentClient(path="./chromadb-store")

# Get the source collection and retrieve all of its data in a single call
source_collection = source_client.get_collection(source_collection_name)
source_data = source_collection.get(include=["documents", "metadatas", "embeddings"])

# Start from a clean destination, but only delete a previous copy when one
# really exists (the earlier get_or_create + delete sequence created the
# collection just to delete it and always reported a deletion).
# NOTE(review): depending on the installed Chroma version, list_collections()
# yields either collection names or Collection objects, so both are handled.
existing_collection_names = {
    entry if isinstance(entry, str) else entry.name
    for entry in destination_client.list_collections()
}
if destination_collection_name in existing_collection_names:
    print("Deleting existing collection", destination_collection_name)
    destination_client.delete_collection(destination_collection_name)

destination_collection = destination_client.get_or_create_collection(
    destination_collection_name,
    # Copy the source collection's metadata. An empty dict is normalised to
    # None because some Chroma versions appear to reject empty metadata
    # (TODO confirm against the pinned chromadb version).
    metadata=source_collection.metadata or None,
)

# Add data to the destination collection in batches
BATCH_SIZE = 500
total_records = len(source_data["ids"])
# Whether embeddings were returned is the same for every batch, so decide once.
has_embeddings = (
    "embeddings" in source_data and source_data["embeddings"] is not None
)
print(f"Copying {total_records} records in batches of {BATCH_SIZE}...")
for i in tqdm(range(0, total_records, BATCH_SIZE)):
    batch = slice(i, i + BATCH_SIZE)
    destination_collection.add(
        ids=source_data["ids"][batch],
        documents=source_data["documents"][batch],
        metadatas=source_data["metadatas"][batch],
        embeddings=source_data["embeddings"][batch] if has_embeddings else None,
    )

# Report success only after comparing the record counts on both sides.
source_count = source_collection.count()
destination_count = destination_collection.count()
if destination_count == source_count:
    print("✅ Collection copied successfully!")
else:
    print(
        "⚠️ Record count mismatch after copy: "
        f"source has {source_count}, destination has {destination_count}"
    )
print("Total records in source collection = ", source_count)
print("Total records in destination collection = ", destination_count)