"""
Fetch papers from arXiv for the RAG corpus.
Domains: philosophy of science, scientific methodology, epistemology.
"""
import json
import time
from pathlib import Path
import arxiv
from tqdm import tqdm

SEARCH_QUERIES = [
    # Philosophy and methodology (relevant to the label=0 questions: low scores expected)
"philosophy of science research methodology",
"scientific paradigm epistemology",
"systematic review research question formulation",
    # Empirical science: covers the domains of the label=1 questions
"CRISPR gene editing molecular mechanisms",
"transformer neural network scaling language model",
"hippocampus neurogenesis sleep deprivation",
"antibiotic resistance plasmid horizontal gene transfer",
"meta-analysis publication bias statistical methods",
"gut microbiome immune response vaccination",
"working memory cognitive training transfer",
"reinforcement learning reward prediction dopamine",
"protein folding computational prediction",
"climate carbon cycle feedback atmospheric",
"minimum wage employment labor economics",
"urban heat island precipitation climate",
"epigenetics inheritance disease transgenerational",
"convolutional neural network visual representation",
"social network information diffusion",
"replication crisis scientific reproducibility",
]
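
# arXiv also accepts fielded queries (ti:, abs:, cat:, all:) joined with
# AND / OR / ANDNOT, useful when a free-text query needs to be pinned to a
# category. A hypothetical scoped query, kept as a sketch only and not used
# by fetch_corpus:
EXAMPLE_SCOPED_QUERY = 'cat:cs.CL AND abs:"language model scaling"'
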
MAX_RESULTS_PER_QUERY = 50


def fetch_corpus(output_dir: Path, max_per_query: int = MAX_RESULTS_PER_QUERY) -> list[dict]:
    """Fetch paper metadata from arXiv for each query, caching results as JSONL."""
output_dir.mkdir(parents=True, exist_ok=True)
cache_file = output_dir / "papers.jsonl"
    # Reuse a previous fetch if the JSONL cache already exists.
    if cache_file.exists():
        papers = [
            json.loads(line)
            for line in cache_file.read_text(encoding="utf-8").splitlines()
            if line.strip()
        ]
        print(f"Loaded corpus from cache: {len(papers)} papers")
        return papers

    # Paginated client: arXiv asks clients to throttle, hence the 3 s delay
    # between requests; transient failures are retried up to 3 times.
    client = arxiv.Client(page_size=100, delay_seconds=3.0, num_retries=3)
seen_ids: set[str] = set()
papers: list[dict] = []
    for query in tqdm(SEARCH_QUERIES, desc="Searching arXiv"):
search = arxiv.Search(
query=query,
max_results=max_per_query,
sort_by=arxiv.SortCriterion.Relevance,
)
try:
for result in client.results(search):
                # Queries overlap topically; skip papers already collected.
                if result.entry_id in seen_ids:
                    continue
                seen_ids.add(result.entry_id)
papers.append(
{
"id": result.entry_id,
"title": result.title,
"abstract": result.summary.replace("\n", " "),
"year": result.published.year,
"categories": result.categories,
}
)
        except Exception as e:
            # Don't abort the whole fetch over one bad query; log it and move on.
            print(f"Query '{query}' failed: {e}")
        # Small extra pause between queries, on top of the client's own delay.
        time.sleep(1.0)

    # Persist as JSONL so the next run can load from the cache branch above.
    with cache_file.open("w", encoding="utf-8") as f:
for p in papers:
f.write(json.dumps(p, ensure_ascii=False) + "\n")
print(f"Corpus salvo: {len(papers)} papers em {cache_file}")
return papers
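

# Sketch of a downstream step: flattening fetched papers into retrieval
# passages for the RAG index. The schema below ("text" plus "meta") is an
# assumption for illustration, not a format defined elsewhere in this file;
# adapt it to whatever the downstream indexer expects.
def papers_to_passages(papers: list[dict]) -> list[dict]:
    return [
        {
            # Title + abstract is a common minimal passage unit for retrieval.
            "text": f"{p['title']}. {p['abstract']}",
            "meta": {"id": p["id"], "year": p["year"], "categories": p["categories"]},
        }
        for p in papers
    ]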


if __name__ == "__main__":
    import os

    from dotenv import load_dotenv

    # Pick up CORPUS_DIR from a local .env file if one exists.
    load_dotenv()
    corpus_dir = Path(os.getenv("CORPUS_DIR", "data/corpus"))
papers = fetch_corpus(corpus_dir)
print(f"Total: {len(papers)} papers")