""" Fetch papers from arXiv for the RAG corpus. Domains: philosophy of science, scientific methodology, epistemology. """ import json import time from pathlib import Path import arxiv from tqdm import tqdm SEARCH_QUERIES = [ # Filosofia e metodologia (relevante para questoes label=0 — baixo score esperado) "philosophy of science research methodology", "scientific paradigm epistemology", "systematic review research question formulation", # Ciencia empirica — cobre dominios das questoes label=1 "CRISPR gene editing molecular mechanisms", "transformer neural network scaling language model", "hippocampus neurogenesis sleep deprivation", "antibiotic resistance plasmid horizontal gene transfer", "meta-analysis publication bias statistical methods", "gut microbiome immune response vaccination", "working memory cognitive training transfer", "reinforcement learning reward prediction dopamine", "protein folding computational prediction", "climate carbon cycle feedback atmospheric", "minimum wage employment labor economics", "urban heat island precipitation climate", "epigenetics inheritance disease transgenerational", "convolutional neural network visual representation", "social network information diffusion", "replication crisis scientific reproducibility", ] MAX_RESULTS_PER_QUERY = 50 def fetch_corpus(output_dir: Path, max_per_query: int = MAX_RESULTS_PER_QUERY) -> list[dict]: output_dir.mkdir(parents=True, exist_ok=True) cache_file = output_dir / "papers.jsonl" if cache_file.exists(): papers = [ json.loads(l) for l in cache_file.read_text(encoding="utf-8").splitlines() if l.strip() ] print(f"Corpus carregado do cache: {len(papers)} papers") return papers client = arxiv.Client(page_size=100, delay_seconds=3.0, num_retries=3) seen_ids: set[str] = set() papers: list[dict] = [] for query in tqdm(SEARCH_QUERIES, desc="Buscando arXiv"): search = arxiv.Search( query=query, max_results=max_per_query, sort_by=arxiv.SortCriterion.Relevance, ) try: for result in client.results(search): if result.entry_id in seen_ids: continue seen_ids.add(result.entry_id) papers.append( { "id": result.entry_id, "title": result.title, "abstract": result.summary.replace("\n", " "), "year": result.published.year, "categories": result.categories, } ) except Exception as e: print(f"Erro na query '{query}': {e}") time.sleep(1.0) with cache_file.open("w", encoding="utf-8") as f: for p in papers: f.write(json.dumps(p, ensure_ascii=False) + "\n") print(f"Corpus salvo: {len(papers)} papers em {cache_file}") return papers if __name__ == "__main__": from dotenv import load_dotenv load_dotenv() import os corpus_dir = Path(os.getenv("CORPUS_DIR", "data/corpus")) papers = fetch_corpus(corpus_dir) print(f"Total: {len(papers)} papers")