# ingest.py
import os
import glob
import json
from pathlib import Path
from supabase import create_client
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from dotenv import load_dotenv

# load local .env for manual runs (GH Actions will use secrets)
load_dotenv()

# config from env
SUPABASE_URL = os.environ.get("SUPABASE_URL")
SUPABASE_SERVICE_ROLE_KEY = os.environ.get("SUPABASE_SERVICE_ROLE_KEY")
EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL", "all-MiniLM-L6-v2")
DOCS_PATH = os.environ.get("DOCS_PATH", "data/docs")  # path in repo for .txt files
JSON_DATASET_PATH = os.environ.get("JSON_DATASET_PATH", "data/sap_dataset.json")
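
# A local .env for manual runs might look like this (placeholder values):
#
#   SUPABASE_URL=https://your-project.supabase.co
#   SUPABASE_SERVICE_ROLE_KEY=<service-role-key>
#   EMBEDDING_MODEL=all-MiniLM-L6-v2
#   DOCS_PATH=data/docs
#   JSON_DATASET_PATH=data/sap_dataset.json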

if not SUPABASE_URL or not SUPABASE_SERVICE_ROLE_KEY:
    raise SystemExit(
        "Set SUPABASE_URL and SUPABASE_SERVICE_ROLE_KEY in env (local .env or GitHub Secrets) before running."
    )

supabase = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY)
model = SentenceTransformer(EMBEDDING_MODEL)
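
# The inserts below assume a `documents` table whose columns match the row
# dicts built in this script. The migration itself isn't part of this file;
# a minimal pgvector sketch would be (384 dims matches the default
# all-MiniLM-L6-v2 model; adjust if EMBEDDING_MODEL changes):
#
#   create table documents (
#     id bigserial primary key,
#     source text,
#     url text,
#     title text,
#     content text,
#     chunk_id int,
#     embedding vector(384)
#   );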

def chunk_text(text, chunk_size=1200, overlap=200):
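    """Split `text` into overlapping character windows.

    With the defaults (chunk_size=1200, overlap=200) a 2,500-character input
    yields slices [0:1200], [1000:2200], [2000:2500]; each window steps
    forward by chunk_size - overlap, and each chunk is whitespace-stripped.
    """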
    chunks = []
    start = 0
    text_len = len(text)
    while start < text_len:
        end = min(start + chunk_size, text_len)
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        if end == text_len:
            break  # done; otherwise the final overlap re-emits a duplicate tail chunk
        # move forward, keeping `overlap` characters of context
        start = end - overlap if end - overlap > start else end
    return chunks

def ingest_file(filepath, source="sap-docs-scrape"):
    """Chunk, embed, and insert one .txt file; returns the number of chunks inserted."""
    with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
        text = f.read()
    title = os.path.basename(filepath)
    chunks = chunk_text(text)
    rows = []
    for ix, chunk in enumerate(chunks):
        emb = model.encode(chunk).tolist()
        rows.append({
            "source": source,
            "url": None,
            "title": title,
            "content": chunk,
            "chunk_id": ix,
            "embedding": emb,
        })
    if not rows:
        return 0
    try:
        supabase.table("documents").insert(rows).execute()
        print(f"Inserted {len(rows)} chunks for {filepath}")
        return len(rows)
    except Exception as e:
        print(f"Insert error for {filepath}: {e}")
        return 0
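
# Two optional optimizations if ingestion gets slow or payloads get large:
# model.encode(chunks) embeds a whole list in one batch instead of one call
# per chunk, and rows can be inserted in slices (e.g. rows[i:i+200]) if a
# very large file ever trips a request-size limit.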

def ingest_json_dataset(json_path):
    path = Path(json_path)
    if not path.exists():
        print(f"JSON dataset not found at {json_path}, skipping JSON ingest.")
        return 0
    with path.open("r", encoding="utf-8") as f:
        data = json.load(f)
    total_rows = 0
    for article in tqdm(data, desc="json-articles"):
        content = article.get("content", "")
        if not content:
            continue
        title = article.get("title") or "SAP Article"
        url = article.get("url")
        source = article.get("source", "sap-json")
        chunks = chunk_text(content)
        rows = []
        for ix, chunk in enumerate(chunks):
            emb = model.encode(chunk).tolist()
            rows.append({
                "source": source,
                "url": url,
                "title": title,
                "content": chunk,
                "chunk_id": ix,
                "embedding": emb,
            })
        if rows:
            try:
                supabase.table("documents").insert(rows).execute()
                total_rows += len(rows)
            except Exception as e:
                print(f"Insert error for article {title[:60]}: {e}")
    print(f"Inserted {total_rows} chunks from JSON dataset")
    return total_rows
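
# Expected shape of the JSON dataset (inferred from the .get() calls above;
# any other fields are ignored):
#
#   [
#     {
#       "title": "Some SAP article",
#       "url": "https://example.com/article",
#       "source": "sap-json",
#       "content": "full article text ..."
#     }
#   ]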

def main():
    total_inserted = 0

    # Prefer JSON dataset if present
    json_rows = ingest_json_dataset(JSON_DATASET_PATH)
    total_inserted += json_rows or 0

    # Also ingest any text files if present, counting their chunks too
    files = glob.glob(os.path.join(DOCS_PATH, "*.txt"))
    if files:
        print(f"Found {len(files)} txt docs in {DOCS_PATH}")
        for fp in tqdm(files):
            total_inserted += ingest_file(fp)
    else:
        print(f"No txt docs found in {DOCS_PATH}")

    print(f"Ingestion finished. Total chunks inserted: {total_inserted}")

if __name__ == "__main__":
    main()