# Odin / src / rag / build_volve_db.py
# ODIN
# Initial commit: ODIN multi-agent drilling intelligence system
# 67e93c9
"""
build_volve_db.py
-----------------
Builds a combined Volve History & Geophysics Vector DB.
Includes:
1. Structured DDR Activity Narratives
2. Geological Formation Picks (Geophysical Interpretations)
"""
import logging
import os
import re
import shutil
import time
from pathlib import Path

import pandas as pd
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings
from tqdm import tqdm
# Timestamped, level-tagged log records for the whole build run.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Root of the checkout: two levels above the directory holding this file.
BASE_DIR = Path(__file__).resolve().parents[2]

# Input location (processed data) and output location (persisted vector DB).
DATA_DIR = BASE_DIR.joinpath("data", "processed")
DB_DIR = BASE_DIR.joinpath("data", "knowledge_base", "volve_ddr_history")

# Source CSVs consumed by build_combined_db().
DDR_CSV = DATA_DIR.joinpath("ddr", "_ddr_all_activities.csv")
PICKS_CSV = DATA_DIR.joinpath("serialized_text", "well_picks_narratives.csv")
def _load_ddr_documents(csv_path: Path) -> "list[Document]":
    """Convert the DDR activity rows in *csv_path* into ``Document`` objects.

    Missing cells are replaced by empty strings before formatting. A row is
    dropped when it has no comment text and its ``state`` is ``"ok"``.

    Args:
        csv_path: CSV file holding the DDR activity records.

    Returns:
        One ``Document`` per retained row, tagged with source ``"DDR"``.
    """
    logger.info(f"Loading DDR activities from {csv_path}...")
    frame = pd.read_csv(csv_path).fillna("")

    docs = []
    for _, row in tqdm(frame.iterrows(), total=len(frame), desc="DDR"):
        well = str(row.get("well_name", ""))
        # Keep only the first 10 characters of the start timestamp.
        date = str(row.get("act_start", ""))[:10]
        comm = str(row.get("comments", "")).strip()
        state = str(row.get("state", ""))

        # An uncommented row in state "ok" has no narrative worth indexing.
        if not comm and state == "ok":
            continue

        content = (
            f"Date: {date}\n"
            f"Well: {well}\n"
            f"Activity: {row.get('activity_code', '')}\n"
            f"Depth: {row.get('md_m', '')}m\n"
            f"Comments: {comm}"
        )
        metadata = {"source": "DDR", "well": well, "date": date, "type": "activity"}
        docs.append(Document(page_content=content, metadata=metadata))
    return docs


def _load_pick_documents(csv_path: Path) -> "list[Document]":
    """Convert the narratives in the ``text`` column of *csv_path* into ``Document`` objects.

    The well name stored in the metadata is taken from the first
    ``"Well <name>,"`` occurrence in the narrative, or ``"Unknown"`` when no
    such occurrence exists.

    Args:
        csv_path: CSV file with a ``text`` column of formation-pick narratives.

    Returns:
        One ``Document`` per non-blank narrative, tagged with source
        ``"Geophysics"``.
    """
    logger.info(f"Loading Well Picks from {csv_path}...")
    frame = pd.read_csv(csv_path)

    # Compiled once here rather than re-resolved for every row.
    well_pattern = re.compile(r"Well ([\w\s/-]+),")

    docs = []
    skipped = 0
    for _, row in tqdm(frame.iterrows(), total=len(frame), desc="Picks"):
        content = row["text"]
        # A blank CSV cell is read as float NaN; passing it to the regex would
        # raise TypeError and abort the whole build, so skip such rows.
        if not isinstance(content, str) or not content.strip():
            skipped += 1
            continue

        well_match = well_pattern.search(content)
        # The character class admits whitespace, so trim the captured name.
        well = (well_match.group(1).strip() if well_match else "") or "Unknown"
        metadata = {"source": "Geophysics", "well": well, "type": "formation_pick"}
        docs.append(Document(page_content=content, metadata=metadata))

    if skipped:
        logger.warning(f"Skipped {skipped} well-pick rows with empty text.")
    return docs


def build_combined_db() -> None:
    """Rebuild the combined Volve history & geophysics vector DB at ``DB_DIR``.

    Documents are gathered from ``DDR_CSV`` and ``PICKS_CSV`` (whichever
    exist), embedded with a HuggingFace model and written to a Chroma store
    in batches. Any existing store at ``DB_DIR`` is replaced. If no documents
    are found, an error is logged and the existing store is left untouched.
    """
    documents = []

    # 1. Ingest DDR Activities
    if DDR_CSV.exists():
        documents.extend(_load_ddr_documents(DDR_CSV))
    else:
        logger.warning(f"DDR activities file not found, skipping: {DDR_CSV}")

    # 2. Ingest Well Picks (Geophysics)
    if PICKS_CSV.exists():
        documents.extend(_load_pick_documents(PICKS_CSV))
    else:
        logger.warning(f"Well picks file not found, skipping: {PICKS_CSV}")

    if not documents:
        logger.error("No documents found to index.")
        return

    # Load the embedding model BEFORE deleting the old DB: if the model cannot
    # be loaded, the previous DB must not already be gone.
    logger.info("Initializing HuggingFaceEmbeddings...")
    embeddings = HuggingFaceEmbeddings(
        model_name="Octen/Octen-Embedding-0.6B",
        model_kwargs={'device': 'cuda', 'trust_remote_code': True},
        encode_kwargs={'normalize_embeddings': True},
    )

    # Clear existing
    if DB_DIR.exists():
        shutil.rmtree(DB_DIR)

    # Vector Store
    logger.info(f"Building combined Vector DB at {DB_DIR} with {len(documents)} docs...")
    vectorstore = Chroma(persist_directory=str(DB_DIR), embedding_function=embeddings)

    # Add in fixed-size slices instead of one call with every document.
    batch_size = 1000
    for start in tqdm(range(0, len(documents), batch_size), desc="Indexing"):
        vectorstore.add_documents(documents[start:start + batch_size])

    logger.info("✅ Successfully built combined Volve History & Geophysics DB.")
import re
if __name__ == "__main__":
    # Time the full build so the log ends with the end-to-end duration.
    started_at = time.time()
    build_combined_db()
    elapsed = time.time() - started_at
    logger.info(f"Total time: {elapsed:.1f}s")