Spaces:
Sleeping
Sleeping
import os, uuid, json
from pathlib import Path
# Replace ollama import with OpenAI client
from openai import OpenAI
from pypdf import PdfReader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from dotenv import load_dotenv

# Pull configuration overrides from a local .env file, if present.
load_dotenv()

# Directory scanned for source documents (.pdf / .txt / .md).
DATA_DIR = Path("data")
# JSON store for chunk records (id, text, source, chunk index, embedding).
EMBED_FILE = Path("storage/embeddings.json")
# Embedding model name passed to the server.
EMBED_MODEL = os.getenv("EMBED_MODEL", "nomic-embed-text")
# Ollama's OpenAI-compatible endpoint lives under /v1 on the default port.
OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434/v1")
# Placeholder key — presumably Ollama does not check it, but the OpenAI
# client requires one to be set. TODO(review): confirm against deployment.
OLLAMA_API_KEY = os.getenv("OLLAMA_API_KEY", "ollama")

# Initialize OpenAI client for Ollama
openai_client = OpenAI(
    base_url=OLLAMA_BASE_URL,
    api_key=OLLAMA_API_KEY
)
# --- Load or initialize embeddings ---
# Resume from the existing store so previously embedded chunks are kept
# across runs; start empty on first run. NOTE: a corrupt/truncated JSON
# file will raise here rather than silently discarding the store.
if EMBED_FILE.exists():
    with open(EMBED_FILE, "r", encoding="utf-8") as f:
        chunks_data = json.load(f)
else:
    chunks_data = []
# --- Helper functions ---
def load_text_from_file(path: Path) -> str:
    """Return the plain text of a .txt/.md/.pdf file, or "" for other types.

    Text files are read with errors ignored; PDFs are joined page-by-page,
    substituting "" for pages whose extraction yields None.
    """
    suffix = path.suffix.lower()
    if suffix in (".txt", ".md"):
        return path.read_text(encoding="utf-8", errors="ignore")
    if suffix == ".pdf":
        reader = PdfReader(str(path))
        pages = [page.extract_text() or "" for page in reader.pages]
        return "\n".join(pages)
    # Unsupported extension: nothing to extract.
    return ""
def chunk_text(text: str):
    """Split *text* into ~900-char chunks with 150-char overlap.

    Separators are tried coarsest-first (paragraph, line, word, char)
    so chunks break on natural boundaries where possible.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=900,
        chunk_overlap=150,
        separators=["\n\n", "\n", " ", ""],
    ).split_text(text)
# --- Track existing sources ---
# Filenames already present in the store, used to skip reprocessing.
# NOTE(review): keyed by bare filename, so same-named files in different
# subdirectories are treated as duplicates — confirm this is intended.
existing_files = {c["source"] for c in chunks_data}

new_chunks = []
for fp in DATA_DIR.glob("**/*"):
    # glob("**/*") also yields directories; without this guard a directory
    # named e.g. "notes.md" would pass the suffix filter and crash
    # read_text() with IsADirectoryError.
    if not fp.is_file():
        continue
    if fp.suffix.lower() not in [".pdf", ".txt", ".md"]:
        continue
    if fp.name in existing_files:
        continue  # skip already processed files
    raw = load_text_from_file(fp)
    if not raw.strip():
        continue  # nothing extractable (e.g. image-only PDF)
    for idx, piece in enumerate(chunk_text(raw)):
        new_chunks.append({
            "id": str(uuid.uuid4()),   # stable unique id per chunk
            "text": piece,
            "source": fp.name,
            "chunk": idx,              # position of the chunk within its file
            "embedding": None          # filled by the embedding pass below
        })
# --- Generate embeddings with OpenAI client ---
if new_chunks:
    texts = [c["text"] for c in new_chunks]
    embeddings = []
    batch_size = 32  # batched requests amortize HTTP round-trips

    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        try:
            # The embeddings endpoint accepts a list input and returns one
            # vector per element, in order.
            response = openai_client.embeddings.create(
                model=EMBED_MODEL,
                input=batch
            )
            embeddings.extend(item.embedding for item in response.data)
            print(f"Processed batch {i//batch_size + 1}/{(len(texts) + batch_size - 1)//batch_size}")
        except Exception as e:
            print(f"Error embedding batch: {e}")
            # Fallback: retry each text individually so one bad input does
            # not discard the whole batch.
            for text in batch:
                try:
                    response = openai_client.embeddings.create(
                        model=EMBED_MODEL,
                        input=text
                    )
                    embeddings.append(response.data[0].embedding)
                except Exception as e2:
                    print(f"Error embedding individual text: {e2}")
                    # Zero-vector placeholder. Match the dimension of the
                    # vectors obtained so far: the previous hardcoded 384
                    # disagrees with models such as nomic-embed-text
                    # (768-dim) and would break downstream similarity math.
                    dim = len(embeddings[0]) if embeddings else 384
                    embeddings.append([0.0] * dim)

    # Lengths match by construction: every text yields exactly one entry,
    # real or placeholder, so zip pairs chunks and vectors one-to-one.
    for c, e in zip(new_chunks, embeddings):
        c["embedding"] = e
    chunks_data.extend(new_chunks)

    # Save updated embeddings
    EMBED_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(EMBED_FILE, "w", encoding="utf-8") as f:
        json.dump(chunks_data, f, ensure_ascii=False, indent=2)
    print(f"Added {len(new_chunks)} new chunks to {EMBED_FILE}")
else:
    print("No new documents found.")