# ingest.py
import os
import glob
import json
from pathlib import Path

from supabase import create_client
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from dotenv import load_dotenv

# Load local .env for manual runs (GitHub Actions will use secrets).
load_dotenv()

# Config from env.
SUPABASE_URL = os.environ.get("SUPABASE_URL")
SUPABASE_SERVICE_ROLE_KEY = os.environ.get("SUPABASE_SERVICE_ROLE_KEY")
EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL", "all-MiniLM-L6-v2")
DOCS_PATH = os.environ.get("DOCS_PATH", "data/docs")  # path in repo for .txt files
JSON_DATASET_PATH = os.environ.get("JSON_DATASET_PATH", "data/sap_dataset.json")

if not SUPABASE_URL or not SUPABASE_SERVICE_ROLE_KEY:
    raise SystemExit(
        "Set SUPABASE_URL and SUPABASE_SERVICE_ROLE_KEY in env (local .env or GitHub Secrets) before running."
    )

supabase = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY)
model = SentenceTransformer(EMBEDDING_MODEL)
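
# The inserts below assume a pgvector-backed "documents" table. The table name
# comes from the code; the exact columns and types are assumptions sketched
# here for illustration. all-MiniLM-L6-v2 produces 384-dimensional vectors, so
# the embedding column must match that dimension:
#   create extension if not exists vector;
#   create table documents (
#       id bigserial primary key,
#       source text,
#       url text,
#       title text,
#       content text,
#       chunk_id int,
#       embedding vector(384)
#   );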
def chunk_text(text, chunk_size=1200, overlap=200):
    """Split text into overlapping chunks of at most chunk_size characters."""
    chunks = []
    start = 0
    text_len = len(text)
    while start < text_len:
        end = min(start + chunk_size, text_len)
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        if end == text_len:
            break  # done; stepping back by overlap here would re-emit the tail as a duplicate chunk
        # move forward with overlap (guard against overlap >= chunk_size)
        start = end - overlap if end - overlap > start else end
    return chunks
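
# Illustration: with the defaults, a 3000-character text yields three chunks
# covering [0:1200), [1000:2200), and [2000:3000), each consecutive pair
# sharing 200 characters of overlap.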
def ingest_file(filepath, source="sap-docs-scrape"):
    """Chunk, embed, and insert one .txt file; returns the number of chunks inserted."""
    with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
        text = f.read()
    title = os.path.basename(filepath)
    chunks = chunk_text(text)
    rows = []
    for ix, chunk in enumerate(chunks):
        emb = model.encode(chunk).tolist()
        rows.append({
            "source": source,
            "url": None,
            "title": title,
            "content": chunk,
            "chunk_id": ix,
            "embedding": emb,
        })
    if not rows:
        return 0
    try:
        supabase.table("documents").insert(rows).execute()
        print(f"Inserted {len(rows)} chunks for {filepath}")
        return len(rows)
    except Exception as e:
        print(f"Insert error for {filepath}: {e}")
        return 0
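
# Note: encoding one chunk per call keeps the code simple but is slow for big
# files. SentenceTransformer.encode also accepts a list of strings, so a
# batched variant (a sketch, not required by the rest of the script) would be:
#   embs = model.encode(chunks, batch_size=32)  # one batched forward pass
# and then build each row from chunks[ix] and embs[ix].tolist() as above.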
def ingest_json_dataset(json_path):
    """Chunk, embed, and insert every article in the JSON dataset; returns chunks inserted."""
    path = Path(json_path)
    if not path.exists():
        print(f"JSON dataset not found at {json_path}, skipping JSON ingest.")
        return 0
    with path.open("r", encoding="utf-8") as f:
        data = json.load(f)
    total_rows = 0
    for article in tqdm(data, desc="json-articles"):
        content = article.get("content", "")
        if not content:
            continue
        title = article.get("title") or "SAP Article"
        url = article.get("url")
        source = article.get("source", "sap-json")
        chunks = chunk_text(content)
        rows = []
        for ix, chunk in enumerate(chunks):
            emb = model.encode(chunk).tolist()
            rows.append({
                "source": source,
                "url": url,
                "title": title,
                "content": chunk,
                "chunk_id": ix,
                "embedding": emb,
            })
        if rows:
            try:
                supabase.table("documents").insert(rows).execute()
                total_rows += len(rows)
            except Exception as e:
                print(f"Insert error for article {title[:60]}: {e}")
    print(f"Inserted {total_rows} chunks from JSON dataset")
    return total_rows
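
# Caveat: plain .insert() means re-running the script duplicates rows. If the
# table had a unique key over, say, (source, title, chunk_id) — an assumption,
# not something this code enforces — supabase-py's .upsert() could make the
# ingest idempotent:
#   supabase.table("documents").upsert(rows, on_conflict="source,title,chunk_id").execute()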
def main():
    total_inserted = 0

    # Ingest the JSON dataset first, if present.
    total_inserted += ingest_json_dataset(JSON_DATASET_PATH)

    # Also ingest any loose .txt files.
    files = glob.glob(os.path.join(DOCS_PATH, "*.txt"))
    if files:
        print(f"Found {len(files)} txt docs in {DOCS_PATH}")
        for fp in tqdm(files):
            total_inserted += ingest_file(fp)
    else:
        print(f"No txt docs found in {DOCS_PATH}")

    print(f"Ingestion finished. Total chunks inserted: {total_inserted}")


if __name__ == "__main__":
    main()
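
# To run manually (the PyPI names below match the imports above; verify against
# your own requirements file):
#   pip install supabase sentence-transformers tqdm python-dotenv
#   python ingest.py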