"""Ingest CUAD contract .txt files into Supabase as parent/child chunks.

Parents are large (~4000-char) context chunks stored in ``parent_documents``;
children are small (~500-char) search chunks embedded with Jina v2 (8k context)
and stored in ``child_vectors`` with a ``parent_id`` back-reference.
"""

import os
import uuid
from pathlib import Path

import torch
from tqdm import tqdm
from dotenv import load_dotenv
from supabase import create_client

# LangChain Imports
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

# 1. Setup — credentials come from .env.
# NOTE(review): the previous version printed SUPABASE_URL and SUPABASE_KEY to
# stdout; that leaks service credentials into logs, so the print was removed.
load_dotenv()
SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_KEY = os.getenv("SUPABASE_KEY")

if not SUPABASE_URL or not SUPABASE_KEY:
    raise ValueError("❌ Check your .env file!")


def _find_contract_files(base_path):
    """Return paths (as strings) of every .txt file under *base_path*, recursively."""
    return [str(p) for p in Path(base_path).rglob("*.txt")]


def _category_for(file_path):
    """Derive a category from the parent directory name, or "General" for shallow paths."""
    path_parts = Path(file_path).parts
    return path_parts[-2] if len(path_parts) > 2 else "General"


def _ingest_contract(file_path, supabase, embeddings, parent_splitter, child_splitter):
    """Split one contract into parent/child chunks and upload both to Supabase.

    Skips files shorter than 100 characters (empty or junk extractions).
    """
    category = _category_for(file_path)

    with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
        text = f.read()

    if len(text) < 100:
        return

    for parent in parent_splitter.create_documents([text]):
        parent_uuid = str(uuid.uuid4())

        # A. Upload Parent (Context)
        supabase.table("parent_documents").insert({
            "id": parent_uuid,
            "content": parent.page_content,
            "metadata": {
                "source": os.path.basename(file_path),
                "category": category,
                "model": "jina-v2-base-en",
            },
        }).execute()

        # B. Create & Embed Children (Search)
        child_chunks = child_splitter.create_documents([parent.page_content])
        child_texts = [c.page_content for c in child_chunks]
        if not child_texts:
            continue

        # Embed the whole batch at once (runs on GPU when available).
        vectors = embeddings.embed_documents(child_texts)
        payload = [
            {
                "content": child_texts[i],
                "embedding": vector,
                "parent_id": parent_uuid,
                "metadata": {"chunk_index": i},
            }
            for i, vector in enumerate(vectors)
        ]
        if payload:
            supabase.table("child_vectors").insert(payload).execute()


def ingest_jina_8k(base_path="CUAD_v1/full_contract_txt"):
    """Ingest every contract under *base_path* into the Supabase vector store.

    Args:
        base_path: Root directory to scan recursively for .txt contracts.
            Defaults to the CUAD v1 layout used by the original script.
    """
    print("🚀 Initializing Jina v2 (8k Context) on GPU...")

    # Check for GPU
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"⚙️ Running on: {device.upper()}")

    # 2. Load Model (The Magic Part)
    embeddings = HuggingFaceEmbeddings(
        model_name="jinaai/jina-embeddings-v2-base-en",
        model_kwargs={'device': device, 'trust_remote_code': True},  # Jina needs trust_remote_code
        encode_kwargs={'normalize_embeddings': True},
    )

    supabase = create_client(SUPABASE_URL, SUPABASE_KEY)

    # 3. Text Splitters (Optimized for Jina)
    # Since Jina handles 8k tokens, we can make the PARENT chunk huge.
    # 4000 characters is ~1000 tokens. We can go even bigger safely.
    parent_splitter = RecursiveCharacterTextSplitter(chunk_size=4000, chunk_overlap=200)
    # Children for search still need to be precise
    child_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)

    # 4. Find Files
    file_paths = _find_contract_files(base_path)
    print(f"🔍 Found {len(file_paths)} contracts.")

    # 5. Processing Loop — a failure on one contract must not abort the batch,
    # so each file is wrapped in its own try/except (logged, then skipped).
    for file_path in tqdm(file_paths, desc="Ingesting"):
        try:
            _ingest_contract(file_path, supabase, embeddings, parent_splitter, child_splitter)
        except Exception as e:
            print(f"❌ Error on {file_path}: {e}")
            continue

    print("✅ Ingestion Complete. You now have an 8K-context legal search engine.")


if __name__ == "__main__":
    ingest_jina_8k()