# ai-service/scripts/train_creative.py
"""Ingest creative-training files (.txt / .json / .csv) into the vector store.

Scans data/creative_training under the project root and embeds every
supported file into the "creative_mind" collection.
"""
import os
import sys
import json
import csv

# Path setup: make the parent package importable when run as a script.
current_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(current_dir)
sys.path.append(parent_dir)

from core.rag.store import VectorStore


def ingest_creative_data():
    """Scan the training folder and embed all supported files.

    Supported formats:
      - .txt  : the whole file becomes one document
      - .json : a top-level list of objects; each entry becomes one document
      - .csv  : each row becomes one "key: value" document

    Side effects: writes documents into the "creative_mind" vector
    collection and prints progress to stdout. Returns None.
    """
    print("\nšŸš€ Starting ADVANCED Creative Director Training...")

    # 1. Initialize DB
    store = VectorStore(collection_name="creative_mind")
    data_folder = os.path.join(parent_dir, "data", "creative_training")

    documents = []
    metadatas = []
    ids = []

    if not os.path.exists(data_folder):
        print(f"āŒ Folder missing: {data_folder}")
        return

    # 2. Iterate over files
    files_found = 0
    for filename in os.listdir(data_folder):
        file_path = os.path.join(data_folder, filename)
        try:
            # --- CASE A: TEXT FILES ---
            if filename.endswith(".txt"):
                # BUGFIX: the f-strings below interpolated nothing
                # (literal "(unknown)"), so all .txt files shared one
                # vector-store id and logs never named the file.
                print(f"   šŸ“„ Reading Text: {filename}")
                with open(file_path, "r", encoding="utf-8") as f:
                    content = f.read()
                if content.strip():
                    # Large text files could be chunked; keep it simple for now.
                    documents.append(content)
                    metadatas.append({"source": filename, "type": "text_guide"})
                    ids.append(f"{filename}_full")
                    files_found += 1

            # --- CASE B: JSON FILES (GitHub Style) ---
            elif filename.endswith(".json"):
                print(f"   ✨ Reading JSON: {filename}")
                with open(file_path, "r", encoding="utf-8") as f:
                    data = json.load(f)
                # A list of objects is the common dataset layout.
                if isinstance(data, list):
                    for idx, item in enumerate(data):
                        # Render the JSON object as text so it can be embedded.
                        text_repr = (
                            f"Strategy: {item.get('title', 'Tip')}\n"
                            f"Details: {item.get('content', item)}"
                        )
                        documents.append(text_repr)
                        metadatas.append({"source": filename, "type": "json_entry"})
                        ids.append(f"{filename}_{idx}")
                    files_found += 1

            # --- CASE C: CSV FILES (Excel Style) ---
            elif filename.endswith(".csv"):
                print(f"   šŸ“Š Reading CSV: {filename}")
                # newline="" is required by the csv module so quoted fields
                # containing newlines are parsed correctly.
                with open(file_path, "r", encoding="utf-8", newline="") as f:
                    reader = csv.DictReader(f)
                    for idx, row in enumerate(reader):
                        # Turn the row dict into a readable string, e.g.
                        # {'hook': 'Stop!', 'niche': 'Tech'} -> "hook: Stop!\nniche: Tech"
                        text_repr = "\n".join(f"{k}: {v}" for k, v in row.items())
                        documents.append(text_repr)
                        metadatas.append({"source": filename, "type": "csv_row"})
                        ids.append(f"{filename}_{idx}")
                files_found += 1
        except Exception as e:
            # Best-effort ingestion: report the failing file and keep going.
            print(f"   āš ļø Error processing {filename}: {e}")

    # 3. Save to Database
    if documents:
        print(f"\n 🧠 Embedding {len(documents)} data points into Vector DB...")
        # Batch the inserts in case the dataset is large.
        batch_size = 50
        for i in range(0, len(documents), batch_size):
            end = min(i + batch_size, len(documents))
            print(f"   - Batch {i} to {end}...")
            store.add_text(documents[i:end], metadatas[i:end], ids[i:end])
        print(f"āœ… Training Complete! Scanned {files_found} files.")
    else:
        print("āš ļø No valid data found. Add .txt, .json, or .csv files.")


if __name__ == "__main__":
    ingest_creative_data()