# Clause-AI / data / ingest_hierarchy.py
# (Provenance: uploaded by Kan05, "Upload 9 files", commit 87553a7 verified)
import os
import uuid
import torch
from pathlib import Path
from tqdm import tqdm
from dotenv import load_dotenv
from supabase import create_client
# LangChain Imports
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
# 1. Setup — load Supabase credentials from .env and fail fast if missing.
# NOTE: never print SUPABASE_KEY; it is a secret and would leak into logs.
load_dotenv()
SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_KEY = os.getenv("SUPABASE_KEY")
if not SUPABASE_URL or not SUPABASE_KEY:
    raise ValueError("❌ Check your .env file!")
def ingest_jina_8k(base_path: str = "CUAD_v1/full_contract_txt") -> None:
    """Ingest CUAD contract ``.txt`` files into Supabase as a parent/child
    chunk hierarchy embedded with Jina v2 (8k-context) embeddings.

    Parameters
    ----------
    base_path:
        Root directory searched recursively for ``.txt`` contract files.
        Defaults to the CUAD v1 full-contract dump.

    Side effects: inserts rows into the ``parent_documents`` and
    ``child_vectors`` Supabase tables and downloads the embedding model on
    first use. Errors on individual files are printed and skipped so one
    bad contract cannot abort the whole run.
    """
    print("🚀 Initializing Jina v2 (8k Context) on GPU...")
    # Prefer GPU when available; the model also runs (slowly) on CPU.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"⚙️ Running on: {device.upper()}")

    # 2. Load model. Jina v2 ships custom modeling code on the Hub, which is
    # why trust_remote_code is required.
    embeddings = HuggingFaceEmbeddings(
        model_name="jinaai/jina-embeddings-v2-base-en",
        model_kwargs={'device': device, 'trust_remote_code': True},
        encode_kwargs={'normalize_embeddings': True},
    )
    supabase = create_client(SUPABASE_URL, SUPABASE_KEY)

    # 3. Text splitters. Jina handles ~8k tokens, so parent chunks can be
    # large (4000 chars ≈ 1000 tokens) while child chunks stay small so
    # vector search remains precise.
    parent_splitter = RecursiveCharacterTextSplitter(chunk_size=4000, chunk_overlap=200)
    child_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)

    # 4. Find files — recursive glob replaces the manual os.walk loop.
    file_paths = [str(p) for p in Path(base_path).rglob("*.txt")]
    print(f"🔍 Found {len(file_paths)} contracts.")

    # 5. Processing loop.
    for file_path in tqdm(file_paths, desc="Ingesting"):
        try:
            # The parent directory name doubles as the contract category.
            path_parts = Path(file_path).parts
            category = path_parts[-2] if len(path_parts) > 2 else "General"

            text = Path(file_path).read_text(encoding="utf-8", errors="ignore")
            if len(text) < 100:
                # Skip near-empty files — nothing useful to index.
                continue

            for parent in parent_splitter.create_documents([text]):
                parent_uuid = str(uuid.uuid4())

                # A. Upload the parent chunk (retrieval context).
                supabase.table("parent_documents").insert({
                    "id": parent_uuid,
                    "content": parent.page_content,
                    "metadata": {
                        "source": os.path.basename(file_path),
                        "category": category,
                        "model": "jina-v2-base-en",
                    },
                }).execute()

                # B. Split into child chunks and embed them for search.
                child_texts = [
                    c.page_content
                    for c in child_splitter.create_documents([parent.page_content])
                ]
                if not child_texts:
                    continue

                # Embed the whole batch at once (GPU-friendly).
                vectors = embeddings.embed_documents(child_texts)
                payload = [
                    {
                        "content": child_texts[i],
                        "embedding": vector,
                        "parent_id": parent_uuid,
                        "metadata": {"chunk_index": i},
                    }
                    for i, vector in enumerate(vectors)
                ]
                supabase.table("child_vectors").insert(payload).execute()
        except Exception as e:
            # Best-effort per file: report and move on to the next contract.
            print(f"❌ Error on {file_path}: {e}")
            continue

    print("✅ Ingestion Complete. You now have an 8K-context legal search engine.")
# Script entry point: run the full ingestion pipeline when executed directly.
if __name__ == "__main__":
    ingest_jina_8k()