File size: 4,124 Bytes
87553a7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import os
import uuid
import torch
from pathlib import Path
from tqdm import tqdm
from dotenv import load_dotenv
from supabase import create_client

# LangChain Imports
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

# 1. Setup — load Supabase credentials from the .env file.
load_dotenv()
SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_KEY = os.getenv("SUPABASE_KEY")
# SECURITY: do NOT print the credentials — the service key must never
# end up in stdout or log files.
if not SUPABASE_URL or not SUPABASE_KEY:
    raise ValueError("❌ Check your .env file!")

def ingest_jina_8k():
    """Ingest CUAD contract .txt files into Supabase with Jina v2 embeddings.

    Uses a parent/child chunking scheme: large parent chunks (~4000 chars)
    are stored as retrieval context in ``parent_documents``; small child
    chunks (~500 chars) are embedded and stored in ``child_vectors`` for
    precise vector search.

    Reads credentials from module-level SUPABASE_URL / SUPABASE_KEY.
    Errors on individual files are logged and skipped (best-effort run).
    """
    print("🚀 Initializing Jina v2 (8k Context) on GPU...")

    # Prefer GPU when available; embedding batches are much faster on CUDA.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"⚙️ Running on: {device.upper()}")

    # 2. Load Model
    embeddings = HuggingFaceEmbeddings(
        model_name="jinaai/jina-embeddings-v2-base-en",
        model_kwargs={'device': device, 'trust_remote_code': True},  # Jina needs trust_remote_code
        encode_kwargs={'normalize_embeddings': True}
    )

    supabase = create_client(SUPABASE_URL, SUPABASE_KEY)

    # 3. Text Splitters (Optimized for Jina)
    # Jina handles 8k tokens, so the PARENT chunk can be large
    # (4000 chars is ~1000 tokens). Children stay small for precise search.
    parent_splitter = RecursiveCharacterTextSplitter(chunk_size=4000, chunk_overlap=200)
    child_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)

    # 4. Find Files — pathlib rglob replaces the manual os.walk loop;
    # sorted() makes the ingestion order deterministic across runs.
    file_paths = sorted(str(p) for p in Path("CUAD_v1/full_contract_txt").rglob("*.txt"))
    print(f"🔍 Found {len(file_paths)} contracts.")

    # 5. Processing Loop — one bad contract must not abort the whole run.
    for file_path in tqdm(file_paths, desc="Ingesting"):
        try:
            _ingest_file(file_path, supabase, embeddings, parent_splitter, child_splitter)
        except Exception as e:
            print(f"❌ Error on {file_path}: {e}")
            continue

    print("✅ Ingestion Complete. You now have an 8K-context legal search engine.")


def _ingest_file(file_path, supabase, embeddings, parent_splitter, child_splitter):
    """Split one contract into parent/child chunks and upload them.

    Raises on any I/O or Supabase error; the caller decides how to recover.
    """
    # Category = immediate parent directory; "General" when the file sits
    # too close to the filesystem root for a category directory to exist.
    path_parts = Path(file_path).parts
    category = path_parts[-2] if len(path_parts) > 2 else "General"

    text = Path(file_path).read_text(encoding="utf-8", errors="ignore")

    # Skip near-empty files (extraction failures / boilerplate stubs).
    if len(text) < 100:
        return

    for parent in parent_splitter.create_documents([text]):
        parent_uuid = str(uuid.uuid4())

        # A. Upload Parent (Context)
        supabase.table("parent_documents").insert({
            "id": parent_uuid,
            "content": parent.page_content,
            "metadata": {
                "source": os.path.basename(file_path),
                "category": category,
                "model": "jina-v2-base-en"
            }
        }).execute()

        # B. Create & Embed Children (Search)
        child_texts = [c.page_content for c in child_splitter.create_documents([parent.page_content])]
        if not child_texts:
            continue

        # Embed the whole batch in one call (GPU-friendly).
        vectors = embeddings.embed_documents(child_texts)
        payload = [
            {
                "content": child_texts[i],
                "embedding": vector,
                "parent_id": parent_uuid,
                "metadata": {"chunk_index": i},
            }
            for i, vector in enumerate(vectors)
        ]
        supabase.table("child_vectors").insert(payload).execute()

# Entry point: run the full ingestion pipeline only when executed as a script.
if __name__ == "__main__":
    ingest_jina_8k()