import os
import logging
import sys
import time
import requests
import subprocess
from pathlib import Path

from tqdm import tqdm
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import OllamaEmbeddings

# --- LOGGING ---
logging.basicConfig(level=logging.INFO, stream=sys.stdout,
                    format='%(message)s', force=True)
logger = logging.getLogger("ReportIngest")


def _ensure_ollama_running(port="25000"):
    """Start a local Ollama server if one is not already answering on `port`."""
    host = f"http://127.0.0.1:{port}"
    try:
        if requests.get(host, timeout=2).status_code == 200:
            return True
    except requests.exceptions.RequestException:
        pass

    logger.info("Starting Ollama server...")
    scratch = os.environ.get("SCRATCH", "/tmp")
    base = Path(scratch)
    bin_path = base / "ollama_core/bin/ollama"
    env = os.environ.copy()
    env["OLLAMA_HOST"] = f"127.0.0.1:{port}"
    env["OLLAMA_MODELS"] = str(base / "ollama_core/models")
    subprocess.Popen([str(bin_path), "serve"],
                     stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
                     env=env)
    time.sleep(5)  # give the server a moment to bind

    # Verify the server actually came up instead of assuming it did
    try:
        return requests.get(host, timeout=2).status_code == 200
    except requests.exceptions.RequestException:
        return False


def ingest_markdown_reports(
    markdown_dir="mshauri-fedha/data/knbs/marker-output",
    vector_db_path="mshauri_fedha_chroma_db",
    model="nomic-embed-text",
    ollama_port="25000",
):
    if not _ensure_ollama_running(ollama_port):
        logger.error(f"Could not reach Ollama on port {ollama_port}. Aborting.")
        return

    if not os.path.exists(markdown_dir):
        logger.error(f"Directory not found: {markdown_dir}")
        return
    logger.info(f"Scanning for Markdown reports in {markdown_dir}...")

    # --- 1. LOAD FILES ---
    # TextLoader is faster than the default loader and doesn't trigger
    # 'unstructured' warnings.
    loader = DirectoryLoader(
        markdown_dir,
        glob="**/*.md",
        loader_cls=TextLoader,
        loader_kwargs={"autodetect_encoding": True},  # safe for varying file encodings
        show_progress=True,
        use_multithreading=True,
    )

    # Catch errors during loading (e.g., unreadable files)
    try:
        raw_docs = loader.load()
    except Exception as e:
        logger.warning(f"Warning during loading: {e}")
        raw_docs = []  # treat the run as empty and bail out below

    if not raw_docs:
        logger.info("No valid Markdown files found.")
        return
    logger.info(f"Loaded {len(raw_docs)} report files.")

    # --- 2. CHUNKING ---
    # Prefer splitting on Markdown headings so chunks align with report sections.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=2000,
        chunk_overlap=200,
        separators=["\n## ", "\n### ", "\n", " ", ""],
    )
    docs = text_splitter.split_documents(raw_docs)

    # --- 3. METADATA ---
    for d in docs:
        d.metadata["type"] = "report"
        # Normalize full file paths to bare filenames; fall back to a generic label.
        d.metadata["source"] = os.path.basename(d.metadata.get("source", "Official Report"))
    logger.info(f"Split into {len(docs)} chunks.")

    # --- 4. EMBEDDING ---
    logger.info("Appending to vector store...")
    embeddings = OllamaEmbeddings(
        model=model,
        base_url=f"http://127.0.0.1:{ollama_port}",
    )
    vectorstore = Chroma(
        persist_directory=vector_db_path,
        embedding_function=embeddings,
    )

    # Add in batches so progress is visible and one failure doesn't lose the run.
    batch_size = 100
    with tqdm(total=len(docs), desc="Ingesting Reports", unit="chunk") as pbar:
        for i in range(0, len(docs), batch_size):
            batch = docs[i:i + batch_size]
            vectorstore.add_documents(batch)
            pbar.update(len(batch))

    logger.info("\nReports added. Hybrid knowledge base is ready.")


if __name__ == "__main__":
    ingest_markdown_reports()
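
# --- Sanity check (a minimal sketch, not part of the pipeline above) ---
# Assuming the store was persisted to the default `vector_db_path`, the
# ingested chunks can be queried back like this; the query string below is a
# hypothetical example.
#
#   from langchain_community.vectorstores import Chroma
#   from langchain_community.embeddings import OllamaEmbeddings
#
#   emb = OllamaEmbeddings(model="nomic-embed-text",
#                          base_url="http://127.0.0.1:25000")
#   db = Chroma(persist_directory="mshauri_fedha_chroma_db",
#               embedding_function=emb)
#   for doc in db.similarity_search("inflation trends 2023", k=3,
#                                   filter={"type": "report"}):
#       print(doc.metadata["source"], "->", doc.page_content[:80])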