import os
import sys
from collections import Counter

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from plagdetect.websearch import federated_search

# Smoke run: send two related queries through the federated search and
# print a short summary of the candidates that come back.
queries = [
    '"attention is all you need" transformer',
    "neural machine translation attention",
]

# Keys whose presence (with a truthy value) counts as a full-text locator.
_FULLTEXT_KEYS = ("pdf_url", "text_url", "fulltext")


def _show_progress(m):
    """Echo one progress message from the search behind a '>' marker."""
    print("  >", m)


cands = federated_search(queries, per_query=4, progress=_show_progress)
print("\ntotal unique candidates:", len(cands))

# Tally candidates per provider; every candidate must carry a "provider" key.
provider_counts = Counter()
for cand in cands:
    provider_counts[cand["provider"]] += 1
print("by provider:", dict(provider_counts))

# A candidate qualifies as soon as any one of the locator keys is truthy.
with_pdf = [
    cand for cand in cands
    if any(cand.get(key) for key in _FULLTEXT_KEYS)
]
print("candidates with a full-text locator:", len(with_pdf))

# Candidates carrying a truthy "oa_via" value.
merged = list(filter(lambda cand: cand.get("oa_via"), cands))
print("candidates whose OA copy was merged from another index:", len(merged))

# Show at most five of them, with the title cut to 55 characters.
for c in merged[:5]:
    print(f"   via {c['oa_via']}: {c['title'][:55]}")