| """ |
| build_volve_db.py |
| ----------------- |
| Builds a combined Volve History & Geophysics Vector DB. |
| Includes: |
| 1. Structured DDR Activity Narratives |
| 2. Geological Formation Picks (Geophysical Interpretations) |
| """ |
|
|
import logging
import os
import re
import shutil
import time
from pathlib import Path

import pandas as pd
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings
from tqdm import tqdm
|
|
# Root logging is configured at import time: INFO threshold, with each record
# rendered as "<timestamp> - <level> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Module-scoped logger used by everything in this script.
logger = logging.getLogger(__name__)
|
|
# Anchor directory: the third ancestor of this file's resolved path
# (parents[0] is the folder that contains the script).
BASE_DIR = Path(__file__).resolve().parents[2]

# Root of the processed input data.
DATA_DIR = BASE_DIR.joinpath("data", "processed")

# Directory the Chroma vector store is persisted to.
DB_DIR = BASE_DIR.joinpath("data", "knowledge_base", "volve_ddr_history")

# Input CSVs: DDR activities and well-pick narratives.
DDR_CSV = DATA_DIR.joinpath("ddr", "_ddr_all_activities.csv")
PICKS_CSV = DATA_DIR.joinpath("serialized_text", "well_picks_narratives.csv")
|
|
def _load_ddr_documents():
    """Build one ``Document`` per usable row of the DDR activities CSV.

    Returns:
        list: Documents tagged ``source="DDR"``; empty when ``DDR_CSV``
        does not exist.
    """
    if not DDR_CSV.exists():
        logger.warning(f"DDR activities file not found, skipping: {DDR_CSV}")
        return []

    logger.info(f"Loading DDR activities from {DDR_CSV}...")
    # Replace NaNs so missing cells render as empty strings instead of "nan".
    df_ddr = pd.read_csv(DDR_CSV).fillna("")

    documents = []
    for _, row in tqdm(df_ddr.iterrows(), total=len(df_ddr), desc="DDR"):
        well = str(row.get("well_name", ""))
        # Keep only the first 10 characters of the start value
        # (presumably the YYYY-MM-DD part of a timestamp -- confirm with data).
        date = str(row.get("act_start", ""))[:10]
        comm = str(row.get("comments", "")).strip()
        state = str(row.get("state", ""))

        # Rows without a comment are dropped only when their state is "ok";
        # comment-less rows in any other state are still indexed.
        if not comm and state == "ok":
            continue

        content = (
            f"Date: {date}\nWell: {well}\n"
            f"Activity: {row.get('activity_code', '')}\n"
            f"Depth: {row.get('md_m', '')}m\nComments: {comm}"
        )
        metadata = {"source": "DDR", "well": well, "date": date, "type": "activity"}
        documents.append(Document(page_content=content, metadata=metadata))
    return documents


def _load_pick_documents():
    """Build one ``Document`` per narrative in the well-picks CSV.

    Returns:
        list: Documents tagged ``source="Geophysics"``; empty when
        ``PICKS_CSV`` does not exist.

    Raises:
        KeyError: If the CSV has no ``text`` column.
    """
    if not PICKS_CSV.exists():
        logger.warning(f"Well picks file not found, skipping: {PICKS_CSV}")
        return []

    logger.info(f"Loading Well Picks from {PICKS_CSV}...")
    df_picks = pd.read_csv(PICKS_CSV)

    # Compiled once, outside the loop; captures the well name between
    # "Well " and the following comma.
    well_pattern = re.compile(r"Well ([\w\s/-]+),")

    documents = []
    skipped = 0
    for content in tqdm(df_picks["text"], total=len(df_picks), desc="Picks"):
        # Empty cells come back from pandas as float NaN; running a regex on
        # them raises TypeError, so skip anything that is not real text.
        if not isinstance(content, str) or not content.strip():
            skipped += 1
            continue

        well_match = well_pattern.search(content)
        # The capture group allows whitespace, so trim it to keep the "well"
        # metadata value consistent for filtering.
        well = well_match.group(1).strip() if well_match else ""
        metadata = {
            "source": "Geophysics",
            "well": well or "Unknown",
            "type": "formation_pick",
        }
        documents.append(Document(page_content=content, metadata=metadata))

    if skipped:
        logger.warning(f"Skipped {skipped} well-pick rows with empty or non-text content.")
    return documents


def build_combined_db(*, device="cuda", batch_size=1000):
    """Rebuild the combined Volve DDR + formation-picks Chroma vector store.

    Loads DDR activity rows and well-pick narratives, embeds them with a
    HuggingFace model and writes them to a fresh store at ``DB_DIR``. Any
    existing store at that path is deleted first. When neither input yields
    a document, an error is logged and nothing on disk is touched.

    Args:
        device: Device string handed to the embedding model (default
            ``"cuda"``, the value that used to be hard-coded).
        batch_size: Number of documents sent per ``add_documents`` call.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    documents = _load_ddr_documents() + _load_pick_documents()
    if not documents:
        logger.error("No documents found to index.")
        return

    # Load the embedding model BEFORE deleting the old store: if the model
    # cannot be initialised (e.g. no GPU), the existing index stays intact.
    logger.info("Initializing HuggingFaceEmbeddings...")
    embeddings = HuggingFaceEmbeddings(
        model_name="Octen/Octen-Embedding-0.6B",
        model_kwargs={'device': device, 'trust_remote_code': True},
        encode_kwargs={'normalize_embeddings': True}
    )

    # Start from an empty directory so stale vectors are never mixed in.
    if DB_DIR.exists():
        shutil.rmtree(DB_DIR)

    logger.info(f"Building combined Vector DB at {DB_DIR} with {len(documents)} docs...")
    vectorstore = Chroma(persist_directory=str(DB_DIR), embedding_function=embeddings)

    # Insert in fixed-size slices rather than one giant call.
    for start in tqdm(range(0, len(documents), batch_size), desc="Indexing"):
        vectorstore.add_documents(documents[start:start + batch_size])

    logger.info("✅ Successfully built combined Volve History & Geophysics DB.")
|
|
if __name__ == "__main__":
    # perf_counter is monotonic, so the reported duration cannot be skewed by
    # system clock adjustments during a long indexing run.
    started = time.perf_counter()
    build_combined_db()
    logger.info(f"Total time: {time.perf_counter() - started:.1f}s")
|
|