Spaces:
Running
Running
"""Build-time data seeding for HF Spaces deployment.

Downloads ACL Anthology papers, ingests them into SQLite + ChromaDB,
and runs regex-based entity enrichment. Designed to run during Docker build
so the Space starts with data ready to query.

Usage:
    python scripts/seed_data.py                    # All major-venue papers (no limit)
    python scripts/seed_data.py --max-papers 500   # Quick demo subset
    python scripts/seed_data.py --year-from 2020 --year-to 2024
"""
| import argparse | |
| import logging | |
| import sys | |
| from pathlib import Path | |
| sys.path.insert(0, str(Path(__file__).parent.parent)) | |
| from src.config import get_config | |
| from src.ingestion.chunking import chunk_papers | |
| from src.ingestion.embeddings import EmbeddingGenerator | |
| from src.ingestion.load_acl_anthology import ACLAnthologyLoader | |
| from src.enrichment.pipeline import EnrichmentPipeline | |
| from src.storage.chroma_store import ChromaStore | |
| from src.storage.sqlite_db import SQLiteDB | |
# Configure logging once at import time so every pipeline stage below
# reports progress at INFO level with a timestamp and logger name.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
)
logger = logging.getLogger(__name__)

# Venue identifiers passed to the ACL Anthology loader as its ``venues``
# filter; the module docstring refers to these as the "major-venue" papers.
DEMO_VENUES = ["acl", "emnlp", "naacl", "findings", "eacl", "coling"]
def _parse_args(argv=None):
    """Parse and sanity-check the command-line options for the seeding run."""
    parser = argparse.ArgumentParser(description="Seed ResearchRadar with demo data")
    parser.add_argument("--max-papers", type=int, default=None,
                        help="Max papers to load (default: no limit)")
    parser.add_argument("--year-from", type=int, default=None,
                        help="Earliest year to include")
    parser.add_argument("--year-to", type=int, default=None,
                        help="Latest year to include")
    args = parser.parse_args(argv)

    # Fail fast on contradictory input instead of running the loader and
    # only then reporting a generic "No papers loaded" error.
    if args.max_papers is not None and args.max_papers < 0:
        parser.error("--max-papers must not be negative")
    if (
        args.year_from is not None
        and args.year_to is not None
        and args.year_from > args.year_to
    ):
        parser.error("--year-from must not be later than --year-to")
    return args


def main(argv=None):
    """Seed the SQLite and ChromaDB stores with ACL Anthology papers.

    Runs the pipeline stages in order: load papers for ``DEMO_VENUES``,
    insert them into SQLite, chunk them with the ``"abstract"`` strategy,
    embed the stored chunks into ChromaDB, and finally run the enrichment
    pipeline with no LLM backend and the regex fallback enabled.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the
            default) lets argparse read ``sys.argv[1:]``, which matches
            the behaviour of calling ``main()`` with no arguments.

    Raises:
        SystemExit: With status 1 when the loader returns no papers, and
            with argparse's usage-error status (2) when the options are
            invalid or contradictory.
    """
    args = _parse_args(argv)
    config = get_config()

    # -- Step 1: Load papers from ACL Anthology ------------------------
    logger.info("=== Step 1: Loading papers from ACL Anthology ===")
    loader = ACLAnthologyLoader()
    papers = loader.load(
        year_from=args.year_from,
        year_to=args.year_to,
        venues=DEMO_VENUES,
        max_papers=args.max_papers,
    )
    logger.info("Loaded %d papers", len(papers))
    if not papers:
        logger.error("No papers loaded. Exiting.")
        sys.exit(1)

    # -- Step 2: Store in SQLite ---------------------------------------
    logger.info("=== Step 2: Storing in SQLite ===")
    db = SQLiteDB(config.sqlite_db_path)
    db.create_schema()
    db.insert_papers(papers)
    logger.info("SQLite: %d papers stored", db.get_paper_count())

    # -- Step 3: Chunk papers ------------------------------------------
    logger.info("=== Step 3: Chunking papers ===")
    chunks = chunk_papers(papers, strategy="abstract")
    db.insert_chunks(chunks)
    logger.info("SQLite: %d chunks stored", db.get_chunk_count())

    # -- Step 4: Generate embeddings -----------------------------------
    logger.info("=== Step 4: Generating embeddings ===")
    chroma = ChromaStore(config.chroma_db_path)
    embedder = EmbeddingGenerator(config.embedding_model)
    # Chunks are read back from SQLite rather than reusing ``chunks``;
    # the variable name suggests the stored rows carry extra metadata
    # (NOTE(review): confirm against SQLiteDB.get_all_chunks).
    chunks_with_meta = db.get_all_chunks()
    embedder.embed_and_store(chunks_with_meta, chroma)
    logger.info("ChromaDB: %d embeddings stored", chroma.count())

    # -- Step 5: Regex-based entity enrichment -------------------------
    logger.info("=== Step 5: Enriching with regex extraction ===")
    enricher = EnrichmentPipeline(
        db=db,
        llm_backend=None,  # No LLM at build time -- regex only
        use_regex_fallback=True,
    )
    stats = enricher.enrich_all()
    logger.info(
        "Enrichment: %d methods, %d datasets, %d tasks, %d topics",
        stats.total_methods,
        stats.total_datasets,
        stats.total_tasks,
        stats.total_topics,
    )

    # -- Summary -------------------------------------------------------
    logger.info("=== Seeding complete ===")
    logger.info(
        "Papers: %d | Chunks: %d | Embeddings: %d",
        db.get_paper_count(),
        db.get_chunk_count(),
        chroma.count(),
    )


if __name__ == "__main__":
    main()