"""Build-time data seeding for HF Spaces deployment.
Downloads ACL Anthology papers, ingests them into SQLite + ChromaDB,
and runs regex-based entity enrichment. Designed to run during Docker build
so the Space starts with data ready to query.
Usage:
python scripts/seed_data.py # All major-venue papers (no limit)
python scripts/seed_data.py --max-papers 500 # Quick demo subset
python scripts/seed_data.py --year-from 2020 --year-to 2024
"""
import argparse
import logging
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
from src.config import get_config
from src.ingestion.chunking import chunk_papers
from src.ingestion.embeddings import EmbeddingGenerator
from src.ingestion.load_acl_anthology import ACLAnthologyLoader
from src.enrichment.pipeline import EnrichmentPipeline
from src.storage.chroma_store import ChromaStore
from src.storage.sqlite_db import SQLiteDB
# Configure root logging once at module import so every library logger
# inherits the timestamped "[LEVEL] name: message" format during the build.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
)
# Module-level logger, named after this module (standard PEP 282 convention).
logger = logging.getLogger(__name__)
# Major *CL venues to pull from the ACL Anthology for the demo corpus.
DEMO_VENUES = ["acl", "emnlp", "naacl", "findings", "eacl", "coling"]
def _parse_args() -> argparse.Namespace:
    """Parse the CLI options that bound the size of the seeded corpus."""
    parser = argparse.ArgumentParser(description="Seed ResearchRadar with demo data")
    parser.add_argument("--max-papers", type=int, default=None,
                        help="Max papers to load (default: no limit)")
    parser.add_argument("--year-from", type=int, default=None,
                        help="Earliest year to include")
    parser.add_argument("--year-to", type=int, default=None,
                        help="Latest year to include")
    return parser.parse_args()


def main() -> None:
    """Run the five-step seeding pipeline.

    Load papers -> store in SQLite -> chunk -> embed into ChromaDB ->
    regex-based entity enrichment. Exits with status 1 if no papers
    were loaded (a Space with an empty corpus is not useful).
    """
    args = _parse_args()
    config = get_config()

    # -- Step 1: Load papers from ACL Anthology ------------------------
    logger.info("=== Step 1: Loading papers from ACL Anthology ===")
    loader = ACLAnthologyLoader()
    papers = loader.load(
        year_from=args.year_from,
        year_to=args.year_to,
        venues=DEMO_VENUES,
        max_papers=args.max_papers,
    )
    logger.info("Loaded %d papers", len(papers))
    if not papers:
        # Fail the Docker build loudly rather than shipping an empty Space.
        logger.error("No papers loaded. Exiting.")
        sys.exit(1)

    # -- Step 2: Store in SQLite ---------------------------------------
    logger.info("=== Step 2: Storing in SQLite ===")
    db = SQLiteDB(config.sqlite_db_path)
    db.create_schema()
    db.insert_papers(papers)
    logger.info("SQLite: %d papers stored", db.get_paper_count())

    # -- Step 3: Chunk papers ------------------------------------------
    logger.info("=== Step 3: Chunking papers ===")
    chunks = chunk_papers(papers, strategy="abstract")
    db.insert_chunks(chunks)
    logger.info("SQLite: %d chunks stored", db.get_chunk_count())

    # -- Step 4: Generate embeddings -----------------------------------
    logger.info("=== Step 4: Generating embeddings ===")
    chroma = ChromaStore(config.chroma_db_path)
    embedder = EmbeddingGenerator(config.embedding_model)
    # Re-read chunks from SQLite so embeddings carry the stored metadata/IDs.
    chunks_with_meta = db.get_all_chunks()
    embedder.embed_and_store(chunks_with_meta, chroma)
    logger.info("ChromaDB: %d embeddings stored", chroma.count())

    # -- Step 5: Regex-based entity enrichment -------------------------
    logger.info("=== Step 5: Enriching with regex extraction ===")
    enricher = EnrichmentPipeline(
        db=db,
        llm_backend=None,  # No LLM at build time -- regex only
        use_regex_fallback=True,
    )
    stats = enricher.enrich_all()
    logger.info(
        "Enrichment: %d methods, %d datasets, %d tasks, %d topics",
        stats.total_methods,
        stats.total_datasets,
        stats.total_tasks,
        stats.total_topics,
    )

    # -- Summary -------------------------------------------------------
    logger.info("=== Seeding complete ===")
    logger.info(
        "Papers: %d | Chunks: %d | Embeddings: %d",
        db.get_paper_count(),
        db.get_chunk_count(),
        chroma.count(),
    )


if __name__ == "__main__":
    main()