File size: 3,469 Bytes
67e93c9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
"""
build_volve_db.py
-----------------
Builds a combined Volve History & Geophysics Vector DB.
Includes:
1. Structured DDR Activity Narratives
2. Geological Formation Picks (Geophysical Interpretations)
"""

import os
import time
import shutil
import logging
import pandas as pd
from pathlib import Path
from tqdm import tqdm

from langchain_core.documents import Document
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

# Configure root logging at import time: INFO level, "<timestamp> - <level> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Two levels above the directory that contains this file
# (presumably the repository root -- confirm against the project layout).
BASE_DIR = Path(__file__).resolve().parents[2]
# Directory holding the pre-processed input CSVs.
DATA_DIR = BASE_DIR / "data" / "processed"
# Persist directory of the Chroma vector store; wiped and rebuilt by build_combined_db().
DB_DIR = BASE_DIR / "data" / "knowledge_base" / "volve_ddr_history"
# Input 1: DDR activity rows (columns read: well_name, act_start, comments, state,
# activity_code, md_m).
DDR_CSV = DATA_DIR / "ddr" / "_ddr_all_activities.csv"
# Input 2: well-pick narratives (column read: text).
PICKS_CSV = DATA_DIR / "serialized_text" / "well_picks_narratives.csv"

def _load_ddr_documents() -> "list[Document]":
    """Convert every row of ``DDR_CSV`` into a ``Document``.

    Rows that have an empty ``comments`` field *and* ``state == "ok"`` are
    skipped. Missing cells are read as empty strings.

    Returns:
        The DDR activity documents, or an empty list when the CSV is absent.
    """
    if not DDR_CSV.exists():
        logger.warning("DDR activities file not found, skipping: %s", DDR_CSV)
        return []

    logger.info("Loading DDR activities from %s...", DDR_CSV)
    df_ddr = pd.read_csv(DDR_CSV).fillna("")

    documents = []
    for _, row in tqdm(df_ddr.iterrows(), total=len(df_ddr), desc="DDR"):
        well = str(row.get("well_name", ""))
        # Keep only the first 10 characters of the start timestamp.
        date = str(row.get("act_start", ""))[:10]
        comm = str(row.get("comments", "")).strip()
        state = str(row.get("state", ""))
        if not comm and state == "ok":
            continue

        content = f"Date: {date}\nWell: {well}\nActivity: {row.get('activity_code','')}\nDepth: {row.get('md_m','')}m\nComments: {comm}"
        metadata = {"source": "DDR", "well": well, "date": date, "type": "activity"}
        documents.append(Document(page_content=content, metadata=metadata))
    return documents


def _load_pick_documents() -> "list[Document]":
    """Convert every non-empty narrative in ``PICKS_CSV`` into a ``Document``.

    The well name stored in the metadata is parsed from the narrative itself
    (the text between ``"Well "`` and the following comma); ``"Unknown"`` is
    used when the narrative does not match.

    Returns:
        The formation-pick documents, or an empty list when the CSV is absent.

    Raises:
        KeyError: if the CSV has no ``text`` column.
    """
    if not PICKS_CSV.exists():
        logger.warning("Well picks file not found, skipping: %s", PICKS_CSV)
        return []

    logger.info("Loading Well Picks from %s...", PICKS_CSV)
    df_picks = pd.read_csv(PICKS_CSV)

    # Compiled here rather than at module level: `re` is imported near the
    # bottom of the file, so it is only guaranteed to exist at call time.
    well_pattern = re.compile(r"Well ([\w\s/-]+),")

    documents = []
    for _, row in tqdm(df_picks.iterrows(), total=len(df_picks), desc="Picks"):
        text = row["text"]
        # An empty CSV cell is read as float NaN, which would make the regex
        # search raise TypeError; such rows carry nothing to index.
        if pd.isna(text) or not str(text).strip():
            continue
        content = str(text)

        # Extract well name from narrative for metadata if possible.
        well_match = well_pattern.search(content)
        # The character class admits whitespace, so trim the captured name.
        well = well_match.group(1).strip() if well_match else "Unknown"
        metadata = {"source": "Geophysics", "well": well, "type": "formation_pick"}
        documents.append(Document(page_content=content, metadata=metadata))
    return documents


def build_combined_db(*, device: str = "cuda", batch_size: int = 1000) -> None:
    """Rebuild the combined Volve History & Geophysics vector DB at ``DB_DIR``.

    Steps:
      1. Load DDR activity documents and well-pick documents from their CSVs.
      2. Load the ``Octen/Octen-Embedding-0.6B`` embedding model.
      3. Remove any existing ``DB_DIR`` and index all documents into a fresh
         Chroma store persisted there, ``batch_size`` documents at a time.

    If no documents are found, an error is logged and any existing DB is left
    untouched.

    Args:
        device: Device string handed to the embedding model (default ``"cuda"``).
        batch_size: Number of documents per ``add_documents`` call.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    documents = _load_ddr_documents() + _load_pick_documents()

    if not documents:
        logger.error("No documents found to index.")
        return

    # Load the model *before* deleting the old DB: if this step fails (no GPU,
    # model cannot be loaded, ...) the previous database is still intact.
    logger.info("Initializing HuggingFaceEmbeddings...")
    embeddings = HuggingFaceEmbeddings(
        model_name="Octen/Octen-Embedding-0.6B",
        model_kwargs={'device': device, 'trust_remote_code': True},
        encode_kwargs={'normalize_embeddings': True}
    )

    # Clear existing
    if DB_DIR.exists():
        shutil.rmtree(DB_DIR)

    # Vector Store
    logger.info("Building combined Vector DB at %s with %d docs...", DB_DIR, len(documents))
    vectorstore = Chroma(persist_directory=str(DB_DIR), embedding_function=embeddings)

    for start in tqdm(range(0, len(documents), batch_size), desc="Indexing"):
        vectorstore.add_documents(documents[start:start + batch_size])

    logger.info("✅ Successfully built combined Volve History & Geophysics DB.")

import re
if __name__ == "__main__":
    # Script entry point: run the full build and report the wall-clock duration.
    started_at = time.time()
    build_combined_db()
    elapsed_s = time.time() - started_at
    logger.info(f"Total time: {elapsed_s:.1f}s")