# Clause-AI / data / ingest_hierarchy.py
# (Provenance: uploaded by Kan05, "Upload 9 files", commit 87553a7 verified)
import os
import uuid
import torch
from pathlib import Path
from tqdm import tqdm
from dotenv import load_dotenv
from supabase import create_client
# LangChain Imports
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
# 1. Setup — load Supabase credentials from .env and fail fast if missing.
# NOTE: never print SUPABASE_KEY; it is a secret and would leak into logs.
load_dotenv()
SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_KEY = os.getenv("SUPABASE_KEY")
if not SUPABASE_URL or not SUPABASE_KEY:
    raise ValueError("❌ Check your .env file!")
def ingest_jina_8k(base_path: str = "CUAD_v1/full_contract_txt") -> None:
    """Ingest CUAD contract ``.txt`` files into Supabase as a parent/child
    chunk hierarchy embedded with Jina v2 (8k-context) embeddings.

    Parameters
    ----------
    base_path:
        Root directory searched recursively for ``.txt`` contract files.
        Defaults to the CUAD v1 full-contract dump.

    Side effects: inserts rows into the ``parent_documents`` and
    ``child_vectors`` Supabase tables and downloads the embedding model on
    first use. Errors on individual files are printed and skipped so one
    bad contract cannot abort the whole run.
    """
    print("🚀 Initializing Jina v2 (8k Context) on GPU...")
    # Prefer GPU when available; the model also runs (slowly) on CPU.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"⚙️ Running on: {device.upper()}")

    # 2. Load model. Jina v2 ships custom modeling code on the Hub, which is
    # why trust_remote_code is required.
    embeddings = HuggingFaceEmbeddings(
        model_name="jinaai/jina-embeddings-v2-base-en",
        model_kwargs={'device': device, 'trust_remote_code': True},
        encode_kwargs={'normalize_embeddings': True},
    )
    supabase = create_client(SUPABASE_URL, SUPABASE_KEY)

    # 3. Text splitters. Jina handles ~8k tokens, so parent chunks can be
    # large (4000 chars ≈ 1000 tokens) while child chunks stay small so
    # vector search remains precise.
    parent_splitter = RecursiveCharacterTextSplitter(chunk_size=4000, chunk_overlap=200)
    child_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)

    # 4. Find files — recursive glob replaces the manual os.walk loop.
    file_paths = [str(p) for p in Path(base_path).rglob("*.txt")]
    print(f"🔍 Found {len(file_paths)} contracts.")

    # 5. Processing loop.
    for file_path in tqdm(file_paths, desc="Ingesting"):
        try:
            # The parent directory name doubles as the contract category.
            path_parts = Path(file_path).parts
            category = path_parts[-2] if len(path_parts) > 2 else "General"

            text = Path(file_path).read_text(encoding="utf-8", errors="ignore")
            if len(text) < 100:
                # Skip near-empty files — nothing useful to index.
                continue

            for parent in parent_splitter.create_documents([text]):
                parent_uuid = str(uuid.uuid4())

                # A. Upload the parent chunk (retrieval context).
                supabase.table("parent_documents").insert({
                    "id": parent_uuid,
                    "content": parent.page_content,
                    "metadata": {
                        "source": os.path.basename(file_path),
                        "category": category,
                        "model": "jina-v2-base-en",
                    },
                }).execute()

                # B. Split into child chunks and embed them for search.
                child_texts = [
                    c.page_content
                    for c in child_splitter.create_documents([parent.page_content])
                ]
                if not child_texts:
                    continue

                # Embed the whole batch at once (GPU-friendly).
                vectors = embeddings.embed_documents(child_texts)
                payload = [
                    {
                        "content": child_texts[i],
                        "embedding": vector,
                        "parent_id": parent_uuid,
                        "metadata": {"chunk_index": i},
                    }
                    for i, vector in enumerate(vectors)
                ]
                supabase.table("child_vectors").insert(payload).execute()
        except Exception as e:
            # Best-effort per file: report and move on to the next contract.
            print(f"❌ Error on {file_path}: {e}")
            continue

    print("✅ Ingestion Complete. You now have an 8K-context legal search engine.")
# Script entry point: run the full ingestion pipeline when executed directly.
if __name__ == "__main__":
    ingest_jina_8k()