# ingest.py
import os
import glob
import json
from pathlib import Path
from supabase import create_client
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from dotenv import load_dotenv

# load local .env for manual runs (GH Actions will use secrets)
load_dotenv()

# config from env
SUPABASE_URL = os.environ.get("SUPABASE_URL")
SUPABASE_SERVICE_ROLE_KEY = os.environ.get("SUPABASE_SERVICE_ROLE_KEY")
EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL", "all-MiniLM-L6-v2")
DOCS_PATH = os.environ.get("DOCS_PATH", "data/docs")  # path in repo for .txt files
JSON_DATASET_PATH = os.environ.get("JSON_DATASET_PATH", "data/sap_dataset.json")
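
# A local .env for manual runs might look like this (placeholder values):
#
#   SUPABASE_URL=https://your-project.supabase.co
#   SUPABASE_SERVICE_ROLE_KEY=<service-role-key>
#   EMBEDDING_MODEL=all-MiniLM-L6-v2
#   DOCS_PATH=data/docs
#   JSON_DATASET_PATH=data/sap_dataset.json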

if not SUPABASE_URL or not SUPABASE_SERVICE_ROLE_KEY:
    raise SystemExit(
        "Set SUPABASE_URL and SUPABASE_SERVICE_ROLE_KEY in env (local .env or GitHub Secrets) before running."
    )

supabase = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY)
model = SentenceTransformer(EMBEDDING_MODEL)
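
# The inserts below assume a `documents` table whose columns match the row
# dicts built in this script. The migration itself isn't part of this file;
# a minimal pgvector sketch would be (384 dims matches the default
# all-MiniLM-L6-v2 model; adjust if EMBEDDING_MODEL changes):
#
#   create table documents (
#     id bigserial primary key,
#     source text,
#     url text,
#     title text,
#     content text,
#     chunk_id int,
#     embedding vector(384)
#   );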

def chunk_text(text, chunk_size=1200, overlap=200):
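    """Split `text` into overlapping character windows.

    With the defaults (chunk_size=1200, overlap=200) a 2,500-character input
    yields slices [0:1200], [1000:2200], [2000:2500]; each window steps
    forward by chunk_size - overlap, and each chunk is whitespace-stripped.
    """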
    chunks = []
    start = 0
    text_len = len(text)
    while start < text_len:
        end = min(start + chunk_size, text_len)
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        if end == text_len:
            break  # done; otherwise the final overlap re-emits a duplicate tail chunk
        # move forward, keeping `overlap` characters of context
        start = end - overlap if end - overlap > start else end
    return chunks

def ingest_file(filepath, source="sap-docs-scrape"):
    """Chunk, embed, and insert one .txt file; returns the number of chunks inserted."""
    with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
        text = f.read()
    title = os.path.basename(filepath)
    chunks = chunk_text(text)
    rows = []
    for ix, chunk in enumerate(chunks):
        emb = model.encode(chunk).tolist()
        rows.append({
            "source": source,
            "url": None,
            "title": title,
            "content": chunk,
            "chunk_id": ix,
            "embedding": emb,
        })
    if not rows:
        return 0
    try:
        supabase.table("documents").insert(rows).execute()
        print(f"Inserted {len(rows)} chunks for {filepath}")
        return len(rows)
    except Exception as e:
        print(f"Insert error for {filepath}: {e}")
        return 0
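
# Two optional optimizations if ingestion gets slow or payloads get large:
# model.encode(chunks) embeds a whole list in one batch instead of one call
# per chunk, and rows can be inserted in slices (e.g. rows[i:i+200]) if a
# very large file ever trips a request-size limit.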

def ingest_json_dataset(json_path):
    path = Path(json_path)
    if not path.exists():
        print(f"JSON dataset not found at {json_path}, skipping JSON ingest.")
        return 0
    with path.open("r", encoding="utf-8") as f:
        data = json.load(f)
    total_rows = 0
    for article in tqdm(data, desc="json-articles"):
        content = article.get("content", "")
        if not content:
            continue
        title = article.get("title") or "SAP Article"
        url = article.get("url")
        source = article.get("source", "sap-json")
        chunks = chunk_text(content)
        rows = []
        for ix, chunk in enumerate(chunks):
            emb = model.encode(chunk).tolist()
            rows.append({
                "source": source,
                "url": url,
                "title": title,
                "content": chunk,
                "chunk_id": ix,
                "embedding": emb,
            })
        if rows:
            try:
                supabase.table("documents").insert(rows).execute()
                total_rows += len(rows)
            except Exception as e:
                print(f"Insert error for article {title[:60]}: {e}")
    print(f"Inserted {total_rows} chunks from JSON dataset")
    return total_rows
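
# Expected shape of the JSON dataset (inferred from the .get() calls above;
# any other fields are ignored):
#
#   [
#     {
#       "title": "Some SAP article",
#       "url": "https://example.com/article",
#       "source": "sap-json",
#       "content": "full article text ..."
#     }
#   ]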

def main():
    total_inserted = 0

    # Prefer JSON dataset if present
    json_rows = ingest_json_dataset(JSON_DATASET_PATH)
    total_inserted += json_rows or 0

    # Also ingest any text files if present, counting their chunks too
    files = glob.glob(os.path.join(DOCS_PATH, "*.txt"))
    if files:
        print(f"Found {len(files)} txt docs in {DOCS_PATH}")
        for fp in tqdm(files):
            total_inserted += ingest_file(fp)
    else:
        print(f"No txt docs found in {DOCS_PATH}")

    print(f"Ingestion finished. Total chunks inserted: {total_inserted}")

if __name__ == "__main__":
    main()