Spaces:
Sleeping
Sleeping
import os, uuid, json
from pathlib import Path
# Replace ollama import with OpenAI client
from openai import OpenAI
from pypdf import PdfReader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from dotenv import load_dotenv

# Pull configuration overrides from a local .env file, if present.
load_dotenv()

# Directory scanned for source documents (.pdf / .txt / .md).
DATA_DIR = Path("data")
# JSON store for chunk records (id, text, source, chunk index, embedding).
EMBED_FILE = Path("storage/embeddings.json")
# Embedding model name passed to the server.
EMBED_MODEL = os.getenv("EMBED_MODEL", "nomic-embed-text")
# Ollama's OpenAI-compatible endpoint lives under /v1 on the default port.
OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434/v1")
# Placeholder key — presumably Ollama does not check it, but the OpenAI
# client requires one to be set. TODO(review): confirm against deployment.
OLLAMA_API_KEY = os.getenv("OLLAMA_API_KEY", "ollama")

# Initialize OpenAI client for Ollama
openai_client = OpenAI(
    base_url=OLLAMA_BASE_URL,
    api_key=OLLAMA_API_KEY
)
# --- Load or initialize embeddings ---
# Resume from the existing store so previously embedded chunks are kept
# across runs; start empty on first run. NOTE: a corrupt/truncated JSON
# file will raise here rather than silently discarding the store.
if EMBED_FILE.exists():
    with open(EMBED_FILE, "r", encoding="utf-8") as f:
        chunks_data = json.load(f)
else:
    chunks_data = []
# --- Helper functions ---
def load_text_from_file(path: Path) -> str:
    """Return the plain text of a .txt/.md/.pdf file, or "" for other types.

    Text files are read with errors ignored; PDFs are joined page-by-page,
    substituting "" for pages whose extraction yields None.
    """
    suffix = path.suffix.lower()
    if suffix in (".txt", ".md"):
        return path.read_text(encoding="utf-8", errors="ignore")
    if suffix == ".pdf":
        reader = PdfReader(str(path))
        pages = [page.extract_text() or "" for page in reader.pages]
        return "\n".join(pages)
    # Unsupported extension: nothing to extract.
    return ""
def chunk_text(text: str):
    """Split *text* into ~900-char chunks with 150-char overlap.

    Separators are tried coarsest-first (paragraph, line, word, char)
    so chunks break on natural boundaries where possible.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=900,
        chunk_overlap=150,
        separators=["\n\n", "\n", " ", ""],
    ).split_text(text)
# --- Track existing sources ---
# Filenames already present in the store, used to skip reprocessing.
# NOTE(review): keyed by bare filename, so same-named files in different
# subdirectories are treated as duplicates — confirm this is intended.
existing_files = {c["source"] for c in chunks_data}

new_chunks = []
for fp in DATA_DIR.glob("**/*"):
    # glob("**/*") also yields directories; without this guard a directory
    # named e.g. "notes.md" would pass the suffix filter and crash
    # read_text() with IsADirectoryError.
    if not fp.is_file():
        continue
    if fp.suffix.lower() not in [".pdf", ".txt", ".md"]:
        continue
    if fp.name in existing_files:
        continue  # skip already processed files
    raw = load_text_from_file(fp)
    if not raw.strip():
        continue  # nothing extractable (e.g. image-only PDF)
    for idx, piece in enumerate(chunk_text(raw)):
        new_chunks.append({
            "id": str(uuid.uuid4()),   # stable unique id per chunk
            "text": piece,
            "source": fp.name,
            "chunk": idx,              # position of the chunk within its file
            "embedding": None          # filled by the embedding pass below
        })
# --- Generate embeddings with OpenAI client ---
if new_chunks:
    texts = [c["text"] for c in new_chunks]
    embeddings = []
    batch_size = 32  # batched requests amortize HTTP round-trips

    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        try:
            # The embeddings endpoint accepts a list input and returns one
            # vector per element, in order.
            response = openai_client.embeddings.create(
                model=EMBED_MODEL,
                input=batch
            )
            embeddings.extend(item.embedding for item in response.data)
            print(f"Processed batch {i//batch_size + 1}/{(len(texts) + batch_size - 1)//batch_size}")
        except Exception as e:
            print(f"Error embedding batch: {e}")
            # Fallback: retry each text individually so one bad input does
            # not discard the whole batch.
            for text in batch:
                try:
                    response = openai_client.embeddings.create(
                        model=EMBED_MODEL,
                        input=text
                    )
                    embeddings.append(response.data[0].embedding)
                except Exception as e2:
                    print(f"Error embedding individual text: {e2}")
                    # Zero-vector placeholder. Match the dimension of the
                    # vectors obtained so far: the previous hardcoded 384
                    # disagrees with models such as nomic-embed-text
                    # (768-dim) and would break downstream similarity math.
                    dim = len(embeddings[0]) if embeddings else 384
                    embeddings.append([0.0] * dim)

    # Lengths match by construction: every text yields exactly one entry,
    # real or placeholder, so zip pairs chunks and vectors one-to-one.
    for c, e in zip(new_chunks, embeddings):
        c["embedding"] = e
    chunks_data.extend(new_chunks)

    # Save updated embeddings
    EMBED_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(EMBED_FILE, "w", encoding="utf-8") as f:
        json.dump(chunks_data, f, ensure_ascii=False, indent=2)
    print(f"Added {len(new_chunks)} new chunks to {EMBED_FILE}")
else:
    print("No new documents found.")