|
|
import os
|
|
|
import uuid
|
|
|
import torch
|
|
|
from pathlib import Path
|
|
|
from tqdm import tqdm
|
|
|
from dotenv import load_dotenv
|
|
|
from supabase import create_client
|
|
|
|
|
|
|
|
|
from langchain_huggingface import HuggingFaceEmbeddings
|
|
|
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
|
|
|
|
|
|
|
# Pull the SUPABASE_* settings from a local .env file into the environment
# before they are read below.
load_dotenv()

SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_KEY = os.getenv("SUPABASE_KEY")

# Startup diagnostic. The key is a credential, so only report whether it is
# set; echoing its value would leak it into terminal scrollback and logs.
print(SUPABASE_URL, "<SUPABASE_KEY set>" if SUPABASE_KEY else "<SUPABASE_KEY missing>")

# Fail fast at import time: nothing below can work without both settings.
if not SUPABASE_URL or not SUPABASE_KEY:
    raise ValueError("❌ Check your .env file!")
|
|
|
|
|
|
def _find_contract_files(base_path):
    """Return the paths of every ``.txt`` file found under *base_path*, recursively."""
    return [
        os.path.join(root, name)
        for root, _dirs, names in os.walk(base_path)
        for name in names
        if name.endswith(".txt")
    ]


def _category_for(file_path, base_path):
    """Return the folder name that categorizes *file_path* inside *base_path*.

    The category is the name of the directory that directly contains the
    file. A file sitting directly in *base_path* has no category folder and
    is labelled "General".
    """
    # Measure depth relative to the scan root. Counting parts of the full path
    # would treat the components of base_path itself as category folders.
    relative_parts = Path(file_path).relative_to(base_path).parts
    return relative_parts[-2] if len(relative_parts) > 1 else "General"


def _ingest_contract(file_path, base_path, supabase, embeddings,
                     parent_splitter, child_splitter):
    """Split one contract file and store its parent chunks and child vectors."""
    category = _category_for(file_path, base_path)

    # Undecodable bytes are dropped rather than aborting the whole file.
    with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
        text = f.read()

    # Skip files too short to be worth indexing.
    if len(text) < 100:
        return

    source_name = os.path.basename(file_path)

    for parent in parent_splitter.create_documents([text]):
        parent_uuid = str(uuid.uuid4())

        child_texts = [
            child.page_content
            for child in child_splitter.create_documents([parent.page_content])
        ]

        # Embed before writing anything, so a failure inside the model cannot
        # leave a parent row in the database without its child vectors.
        vectors = embeddings.embed_documents(child_texts) if child_texts else []

        supabase.table("parent_documents").insert({
            "id": parent_uuid,
            "content": parent.page_content,
            "metadata": {
                "source": source_name,
                "category": category,
                "model": "jina-v2-base-en",
            },
        }).execute()

        if vectors:
            # One row per child chunk, linked to its parent through parent_id.
            payload = [
                {
                    "content": content,
                    "embedding": vector,
                    "parent_id": parent_uuid,
                    "metadata": {"chunk_index": index},
                }
                for index, (content, vector) in enumerate(zip(child_texts, vectors))
            ]
            supabase.table("child_vectors").insert(payload).execute()


def ingest_jina_8k(base_path: str = "CUAD_v1/full_contract_txt") -> None:
    """Ingest contract text files into Supabase as parent/child chunks.

    Every ``.txt`` file under *base_path* is split into parent chunks
    (``chunk_size=4000``, ``chunk_overlap=200``) that are inserted into the
    ``parent_documents`` table. Each parent is split again into child chunks
    (``chunk_size=500``, ``chunk_overlap=50``), which are embedded with
    ``jinaai/jina-embeddings-v2-base-en`` and inserted into ``child_vectors``
    with a ``parent_id`` pointing at the parent row.

    Args:
        base_path: Directory to scan for contracts. Defaults to the CUAD v1
            text folder.

    An error on one file is printed and that file is skipped; the remaining
    files are still processed. Progress and a final summary go to stdout.
    """
    print("🚀 Initializing Jina v2 (8k Context) on GPU...")

    # Fall back to CPU when no CUDA device is available.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"⚙️ Running on: {device.upper()}")

    embeddings = HuggingFaceEmbeddings(
        model_name="jinaai/jina-embeddings-v2-base-en",
        model_kwargs={"device": device, "trust_remote_code": True},
        encode_kwargs={"normalize_embeddings": True},
    )

    supabase = create_client(SUPABASE_URL, SUPABASE_KEY)

    parent_splitter = RecursiveCharacterTextSplitter(chunk_size=4000, chunk_overlap=200)
    child_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)

    file_paths = _find_contract_files(base_path)
    print(f"🔍 Found {len(file_paths)} contracts.")

    failed = 0
    for file_path in tqdm(file_paths, desc="Ingesting"):
        # Best-effort by design: one unreadable file or rejected insert must
        # not abort the whole run, so report it and move on to the next file.
        try:
            _ingest_contract(
                file_path, base_path, supabase, embeddings,
                parent_splitter, child_splitter,
            )
        except Exception as e:
            failed += 1
            print(f"❌ Error on {file_path}: {e}")

    if failed:
        print(f"⚠️ {failed} of {len(file_paths)} contracts failed; see errors above.")

    print("✅ Ingestion Complete. You now have an 8K-context legal search engine.")
|
|
|
|
|
|
# Script entry point: run the ingestion only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    ingest_jina_8k()