"""
Fetch papers from arXiv for the RAG corpus.
Domains: philosophy of science, scientific methodology, epistemology.
"""

import json
import time
from pathlib import Path

import arxiv
from tqdm import tqdm

# Philosophy and methodology queries (relevant to label=0 questions — a low
# score is expected for those).
_METHODOLOGY_QUERIES: list[str] = [
    "philosophy of science research methodology",
    "scientific paradigm epistemology",
    "systematic review research question formulation",
]

# Empirical science queries — cover the domains of the label=1 questions.
_EMPIRICAL_QUERIES: list[str] = [
    "CRISPR gene editing molecular mechanisms",
    "transformer neural network scaling language model",
    "hippocampus neurogenesis sleep deprivation",
    "antibiotic resistance plasmid horizontal gene transfer",
    "meta-analysis publication bias statistical methods",
    "gut microbiome immune response vaccination",
    "working memory cognitive training transfer",
    "reinforcement learning reward prediction dopamine",
    "protein folding computational prediction",
    "climate carbon cycle feedback atmospheric",
    "minimum wage employment labor economics",
    "urban heat island precipitation climate",
    "epigenetics inheritance disease transgenerational",
    "convolutional neural network visual representation",
    "social network information diffusion",
    "replication crisis scientific reproducibility",
]

# Every arXiv search query, methodology group first, then the empirical group.
SEARCH_QUERIES: list[str] = _METHODOLOGY_QUERIES + _EMPIRICAL_QUERIES

# Default number of results requested for each query.
MAX_RESULTS_PER_QUERY: int = 50


def fetch_corpus(output_dir: Path, max_per_query: int = MAX_RESULTS_PER_QUERY) -> list[dict]:
    """Return the arXiv paper corpus, downloading it when no cache exists.

    If ``output_dir / "papers.jsonl"`` already exists, it is parsed (one JSON
    object per line) and returned without contacting arXiv. Otherwise every
    query in ``SEARCH_QUERIES`` is searched, results are de-duplicated by
    entry id, and the collected records are written to that file.

    Args:
        output_dir: Directory that holds the cache file; created if missing.
        max_per_query: Maximum number of results requested for each query.

    Returns:
        A list of dicts with the keys ``id``, ``title``, ``abstract``,
        ``year`` and ``categories``. The list is empty if nothing could be
        fetched; in that case no cache file is written.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    cache_file = output_dir / "papers.jsonl"

    if cache_file.exists():
        # Iterate the file rather than using str.splitlines(): splitlines()
        # also breaks on U+0085/U+2028/U+2029, which json.dumps() with
        # ensure_ascii=False writes unescaped, so a record containing one of
        # them would be cut in half and fail to parse.
        with cache_file.open(encoding="utf-8") as f:
            cached = [json.loads(line) for line in f if line.strip()]
        print(f"Corpus carregado do cache: {len(cached)} papers")
        return cached

    client = arxiv.Client(page_size=100, delay_seconds=3.0, num_retries=3)
    seen_ids: set[str] = set()
    papers: list[dict] = []

    for query in tqdm(SEARCH_QUERIES, desc="Buscando arXiv"):
        search = arxiv.Search(
            query=query,
            max_results=max_per_query,
            sort_by=arxiv.SortCriterion.Relevance,
        )
        try:
            for result in client.results(search):
                # The same paper can match several queries; keep the first hit.
                if result.entry_id in seen_ids:
                    continue
                seen_ids.add(result.entry_id)
                papers.append(
                    {
                        "id": result.entry_id,
                        "title": result.title,
                        "abstract": result.summary.replace("\n", " "),
                        "year": result.published.year,
                        "categories": result.categories,
                    }
                )
        except Exception as e:
            # Best effort: a failing query is reported and skipped so that the
            # remaining queries still run. Results already collected are kept.
            print(f"Erro na query '{query}': {e}")
        time.sleep(1.0)

    if not papers:
        # Do not persist an empty cache: it would make every later run return
        # an empty corpus without ever retrying the download.
        print("Nenhum paper obtido; cache nao gravado.")
        return papers

    # Write to a temporary sibling and rename it into place, so an interrupted
    # run cannot leave a truncated papers.jsonl that is later taken as the
    # complete corpus.
    tmp_file = cache_file.with_name(cache_file.name + ".tmp")
    with tmp_file.open("w", encoding="utf-8") as f:
        f.writelines(json.dumps(p, ensure_ascii=False) + "\n" for p in papers)
    tmp_file.replace(cache_file)

    print(f"Corpus salvo: {len(papers)} papers em {cache_file}")
    return papers


if __name__ == "__main__":
    # Script entry point. Imports used only here stay inside the guard so that
    # importing this module does not require them.
    import os

    from dotenv import load_dotenv

    # Call load_dotenv() before CORPUS_DIR is read from the environment.
    load_dotenv()

    target_dir = Path(os.environ.get("CORPUS_DIR", "data/corpus"))
    corpus = fetch_corpus(target_dir)
    print(f"Total: {len(corpus)} papers")