# sap-chatbot / ingest.py
# Deployed from GitHub Actions 2025-12-11_00:05:39 by github-actions[bot]
# (commit 0f77bc1)
# ingest.py
import os
import glob
import json
from pathlib import Path
from supabase import create_client
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from dotenv import load_dotenv
# load local .env for manual runs (GH Actions will use secrets)
load_dotenv()

# config from env
SUPABASE_URL = os.environ.get("SUPABASE_URL")
SUPABASE_SERVICE_ROLE_KEY = os.environ.get("SUPABASE_SERVICE_ROLE_KEY")
# Sentence-transformers model name used to embed chunks (small, CPU-friendly default).
EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL", "all-MiniLM-L6-v2")
DOCS_PATH = os.environ.get("DOCS_PATH", "data/docs") # path in repo for .txt files
JSON_DATASET_PATH = os.environ.get("JSON_DATASET_PATH", "data/sap_dataset.json")

# Fail fast -- before creating any clients -- if credentials are missing.
if not SUPABASE_URL or not SUPABASE_SERVICE_ROLE_KEY:
    raise SystemExit(
        "Set SUPABASE_URL and SUPABASE_SERVICE_ROLE_KEY in env (local .env or GitHub Secrets) before running."
    )

# Module-level singletons, created once at import time and shared by the
# ingest functions below.
supabase = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY)
model = SentenceTransformer(EMBEDDING_MODEL)
def chunk_text(text, chunk_size=1200, overlap=200):
    """Split *text* into overlapping chunks.

    Args:
        text: String to split.
        chunk_size: Maximum characters per chunk.
        overlap: Characters shared between consecutive chunks; must be
            smaller than chunk_size for the window to advance.

    Returns:
        List of non-empty, whitespace-stripped chunk strings.
    """
    chunks = []
    start = 0
    text_len = len(text)
    while start < text_len:
        end = min(start + chunk_size, text_len)
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        # Bug fix: stop once the end of the text is reached. The original
        # code stepped `start` back by `overlap` even on the final chunk,
        # emitting a redundant tail chunk fully contained in the previous one.
        if end >= text_len:
            break
        # Advance with overlap, guaranteeing forward progress even when
        # overlap >= chunk_size.
        start = end - overlap if end - overlap > start else end
    return chunks
def ingest_file(filepath, source="sap-docs-scrape"):
    """Chunk one text file, embed each chunk, and insert rows into Supabase.

    Args:
        filepath: Path to a UTF-8 text file (undecodable bytes are ignored).
        source: Value stored in each row's "source" column.

    Returns:
        Number of chunks inserted; 0 on empty file or insert failure.
        (Original returned None; callers that ignore the return are unaffected.)
    """
    with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
        text = f.read()
    title = os.path.basename(filepath)

    chunks = chunk_text(text)
    if not chunks:
        return 0

    # Encode all chunks in a single batch call -- one model invocation
    # instead of one per chunk.
    embeddings = model.encode(chunks)
    rows = [
        {
            "source": source,
            "url": None,
            "title": title,
            "content": chunk,
            "chunk_id": ix,
            "embedding": emb.tolist(),
        }
        for ix, (chunk, emb) in enumerate(zip(chunks, embeddings))
    ]

    try:
        supabase.table("documents").insert(rows).execute()
        print(f"Inserted {len(rows)} chunks for {filepath}")
        return len(rows)
    except Exception as e:
        # Best-effort ingest: log and continue with the remaining files.
        print(f"Insert error for {filepath}: {e}")
        return 0
def ingest_json_dataset(json_path):
    """Ingest a JSON dataset of articles into the Supabase "documents" table.

    The file must contain a JSON array of objects with at least a "content"
    key; "title", "url" and "source" are optional.

    Args:
        json_path: Path to the JSON file.

    Returns:
        Total number of chunks inserted (0 if the file is missing).
    """
    path = Path(json_path)
    if not path.exists():
        print(f"JSON dataset not found at {json_path}, skipping JSON ingest.")
        return 0

    with path.open("r", encoding="utf-8") as f:
        data = json.load(f)

    total_rows = 0
    for article in tqdm(data, desc="json-articles"):
        content = article.get("content", "")
        if not content:
            continue  # skip articles with no body text
        title = article.get("title") or "SAP Article"
        url = article.get("url")
        source = article.get("source", "sap-json")

        chunks = chunk_text(content)
        if not chunks:
            continue
        # Batch-encode all chunks of the article in a single model call
        # instead of one encode() per chunk.
        embeddings = model.encode(chunks)
        rows = [
            {
                "source": source,
                "url": url,
                "title": title,
                "content": chunk,
                "chunk_id": ix,
                "embedding": emb.tolist(),
            }
            for ix, (chunk, emb) in enumerate(zip(chunks, embeddings))
        ]

        try:
            supabase.table("documents").insert(rows).execute()
            total_rows += len(rows)
        except Exception as e:
            # Best-effort: log the failing article and keep going.
            print(f"Insert error for article {title[:60]}: {e}")

    print(f"Inserted {total_rows} chunks from JSON dataset")
    return total_rows
def main():
    """Entry point: ingest the JSON dataset first, then any loose .txt docs."""
    total_inserted = 0

    # Prefer the JSON dataset if present.
    total_inserted += ingest_json_dataset(JSON_DATASET_PATH) or 0

    # Also ingest any plain-text docs.
    files = glob.glob(os.path.join(DOCS_PATH, "*.txt"))
    if files:
        print(f"Found {len(files)} txt docs in {DOCS_PATH}")
        for fp in tqdm(files):
            # Bug fix: txt-file chunks were previously not counted toward the
            # final total. `or 0` keeps the sum correct even if ingest_file
            # returns None.
            total_inserted += ingest_file(fp) or 0
    else:
        print(f"No txt docs found in {DOCS_PATH}")

    print(f"Ingestion finished. Total chunks inserted: {total_inserted}")


if __name__ == "__main__":
    main()