from __future__ import annotations

import os
import re
import time
import json
import hashlib
import pathlib
import tempfile
from typing import List, Optional, Dict, Any, Union
from concurrent.futures import ThreadPoolExecutor, as_completed
from collections import defaultdict

import requests
from tqdm import tqdm

# --------------------------------------------------------------------------------------
# Vector store, loaders, splitters
# --------------------------------------------------------------------------------------
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader

# --------------------------------------------------------------------------------------
# OpenAI embeddings
# --------------------------------------------------------------------------------------
from langchain_openai import OpenAIEmbeddings

# --------------------------------------------------------------------------------------
# Tokenizer for true token-based multi-scale segmentation
# --------------------------------------------------------------------------------------
import tiktoken


def sanitize_text(text: str) -> str:
    """
    Remove surrogate pairs and invalid Unicode characters.
    Prevents UnicodeEncodeError when adding documents to ChromaDB.
    """
    if not text:
        return text
    # Replace surrogates and invalid chars with empty string
    return text.encode("utf-8", errors="ignore").decode("utf-8", errors="ignore")
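
# Illustrative behaviour: a lone surrogate cannot survive a UTF-8 round-trip
# with errors="ignore", so it is silently dropped, e.g.
#   sanitize_text("poly\ud800mer")  ->  "polymer"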


# --------------------------------------------------------------------------------------
# ARXIV, OPENALEX, EPMC API URLS
# --------------------------------------------------------------------------------------
ARXIV_SEARCH_URL = "https://export.arxiv.org/api/query"
OPENALEX_WORKS_URL = "https://api.openalex.org/works"
EPMC_SEARCH_URL = "https://www.ebi.ac.uk/europepmc/webservices/rest/search"

DEFAULT_PERSIST_DIR = "chroma_polymer_db"
DEFAULT_TMP_DOWNLOAD_DIR = os.path.join(tempfile.gettempdir(), "polymer_rag_pdfs")
MANIFEST_NAME = "manifest.jsonl"

# --------------------------------------------------------------------------------------
# Balanced target distribution
# --------------------------------------------------------------------------------------
TARGET_CURATED = 100
TARGET_JOURNALS = 200
TARGET_ARXIV = 800
TARGET_OPENALEX = 600
TARGET_EPMC = 200
TARGET_DATABASES = 100

# --------------------------------------------------------------------------------------
# Polymer keywords
# --------------------------------------------------------------------------------------
POLYMER_KEYWORDS = [
    "polymer",
    "macromolecule",
    "macromolecular",
    "polymeric",
    "polymer informatics",
    "polymer chemistry",
    "polymer physics",
    "PSMILES",
    "pSMILES",
    "BigSMILES",
    "polymer SMILES",
    "polymer sequence",
    "polymer electrolyte",
    "polymer morphology",
    "polymer dielectric",
    "polymer electrolyte membrane",
    "block copolymer",
    "biopolymer",
    "polymer nanocomposite",
    "polymer foundation model",
    "self-supervised polymer",
    "masked language model polymer",
    "polymer transformer",
    "generative polymer",
    "copolymer",
    "polymerization",
    "polymer synthesis",
    "polymer characterization",
]

# --------------------------------------------------------------------------------------
# IUPAC Guidelines & Standards (polymer nomenclature and terminology standards)
# --------------------------------------------------------------------------------------
CURATED_IUPAC_STANDARDS: List[Dict[str, Any]] = [
    {
        "url": "https://iupac.org/wp-content/uploads/2019/07/140-Brief-Guide-to-Polymer-Nomenclature-Web-Final-d.pdf",
        "name": "IUPAC - Brief Guide to Polymer Nomenclature",
        "meta": {
            "title": "A Brief Guide to Polymer Nomenclature (IUPAC Technical Report)",
            "year": "2012",
            "venue": "IUPAC Pure and Applied Chemistry",
            "source": "curated_iupac_standard",
        },
    },
    {
        "url": "https://rseq.org/wp-content/uploads/2022/10/20220816-English-BriefGuidePolymerTerminology-IUPAC.pdf",
        "name": "IUPAC - Brief Guide to Polymerization Terminology",
        "meta": {
            "title": "A Brief Guide to Polymerization Terminology (IUPAC Recommendations)",
            "year": "2022",
            "venue": "IUPAC",
            "source": "curated_iupac_standard",
        },
    },
    {
        "url": "https://www.rsc.org/images/richard-jones-naming-polymers_tcm18-243646.pdf",
        "name": "RSC - Naming Polymers",
        "meta": {
            "title": "Naming Polymers (RSC Educational Resource)",
            "year": "2020",
            "venue": "Royal Society of Chemistry",
            "source": "curated_iupac_standard",
        },
    },
]

# --------------------------------------------------------------------------------------
# ISO/ASTM Standards (polymer testing and characterization standards)
# --------------------------------------------------------------------------------------
CURATED_ISO_ASTM_STANDARDS: List[Dict[str, Any]] = [
    {
        "url": "https://cdn.standards.iteh.ai/samples/76910/29c8e7af07bd4188b297c39684ada79e/ISO-ASTM-52925-2022.pdf",
        "name": "ISO/ASTM 52925:2022 - Additive Manufacturing Polymers",
        "meta": {
            "title": "ISO/ASTM 52925:2022 Additive manufacturing of polymers - Feedstock materials",
            "year": "2022",
            "venue": "ISO/ASTM",
            "source": "curated_iso_astm_standard",
        },
    },
    {
        "url": "https://cdn.standards.iteh.ai/samples/76909/b9883b2f204248aca175e2f574bd879c/ISO-ASTM-52924-2023.pdf",
        "name": "ISO/ASTM 52924:2023 - Additive Manufacturing Qualification",
        "meta": {
            "title": "ISO/ASTM 52924:2023 Additive manufacturing of polymers - Qualification principles",
            "year": "2023",
            "venue": "ISO/ASTM",
            "source": "curated_iso_astm_standard",
        },
    },
    {
        "url": "https://nvlpubs.nist.gov/nistpubs/ir/2015/NIST.IR.8059.pdf",
        "name": "NIST IR 8059 - Materials Testing Standards for Additive Manufacturing",
        "meta": {
            "title": "Materials Testing Standards for Additive Manufacturing of Polymer Materials",
            "year": "2015",
            "venue": "NIST",
            "source": "curated_iso_astm_standard",
        },
    },
]

# --------------------------------------------------------------------------------------
# Foundational polymer informatics papers
# --------------------------------------------------------------------------------------
CURATED_POLYMER_INFORMATICS: List[Dict[str, Any]] = [
    {
        "url": "https://ramprasad.mse.gatech.edu/wp-content/uploads/2021/01/polymer-informatics.pdf",
        "name": "Polymer Informatics - Current Status and Critical Next Steps",
        "meta": {
            "title": "Polymer informatics: Current status and critical next steps",
            "year": "2020",
            "venue": "Materials Science and Engineering: R",
            "source": "curated_review_informatics",
        },
    },
    {
        "url": "https://arxiv.org/pdf/2011.00508.pdf",
        "name": "Polymer Informatics - Current Status (arXiv)",
        "meta": {
            "title": "Polymer Informatics: Current Status and Critical Next Steps",
            "year": "2020",
            "venue": "arXiv:2011.00508",
            "source": "curated_review_informatics",
        },
    },
]

# --------------------------------------------------------------------------------------
# BigSMILES notation papers (polymer representation standards)
# --------------------------------------------------------------------------------------
CURATED_BIGSMILES: List[Dict[str, Any]] = [
    {
        "url": "https://pubs.acs.org/doi/pdf/10.1021/acscentsci.9b00476",
        "name": "BigSMILES - Structurally-Based Line Notation",
        "meta": {
            "title": "BigSMILES: A Structurally-Based Line Notation for Describing Macromolecules",
            "year": "2019",
            "venue": "ACS Central Science",
            "source": "curated_bigsmiles",
        },
    },
    {
        "url": "https://www.rsc.org/suppdata/d3/dd/d3dd00147d/d3dd00147d1.pdf",
        "name": "Generative BigSMILES - Supplementary Information",
        "meta": {
            "title": "Generative BigSMILES: an extension for polymer informatics (SI)",
            "year": "2024",
            "venue": "RSC Digital Discovery",
            "source": "curated_bigsmiles",
        },
    },
]

# --------------------------------------------------------------------------------------
# Combine all curated sources
# --------------------------------------------------------------------------------------
CURATED_POLYMER_PDF_SOURCES = (
    CURATED_IUPAC_STANDARDS
    + CURATED_ISO_ASTM_STANDARDS
    + CURATED_POLYMER_INFORMATICS
    + CURATED_BIGSMILES
)

# --------------------------------------------------------------------------------------
# Major polymer journals with OA content
# --------------------------------------------------------------------------------------
POLYMER_JOURNAL_QUERIES = [
    # ACS Journals
    {"journal": "Macromolecules", "issn": "0024-9297", "publisher": "ACS"},
    {"journal": "ACS Polymers Au", "issn": "2768-1939", "publisher": "ACS"},
    {"journal": "ACS Applied Polymer Materials", "issn": "2637-6105", "publisher": "ACS"},
    {"journal": "Biomacromolecules", "issn": "1525-7797", "publisher": "ACS"},
    {"journal": "ACS Macro Letters", "issn": "2161-1653", "publisher": "ACS"},
    # RSC Journals
    {"journal": "Polymer Chemistry", "issn": "1759-9954", "publisher": "RSC"},
    {"journal": "RSC Applied Polymers", "issn": "2755-0656", "publisher": "RSC"},
    {"journal": "Soft Matter", "issn": "1744-683X", "publisher": "RSC"},
    # Springer/Nature Journals
    {"journal": "Polymer Journal", "issn": "0032-3896", "publisher": "Nature"},
    {"journal": "Journal of Polymer Science", "issn": "2642-4169", "publisher": "Wiley"},
    # Additional OA Journals
    {"journal": "Polymer Science and Technology", "issn": "2837-0341", "publisher": "ACS"},
    {"journal": "Polymers", "issn": "2073-4360", "publisher": "MDPI"},
]

DEFAULT_MAILTO = "kaur-m43@webmail.uwinnipeg.ca"  # contact for polite API usage (OpenAlex polite pool)


# --------------------------------------------------------------------------------------
# DEDUPLICATION, DOWNLOAD, MANIFEST HELPERS
# --------------------------------------------------------------------------------------
def sha256_bytes(data: bytes) -> str:
    return hashlib.sha256(data).hexdigest()


def safe_filename(name: str) -> str:
    name = str(name or "").strip().replace("/", "_").replace("\\", "_")
    name = re.sub(r"[^a-zA-Z0-9._\-]", "_", name)
    return name[:200]


def is_probably_pdf(raw: bytes, content_type: str) -> bool:
    if not raw:
        return False
    if raw[:4] == b"%PDF":
        return True
    return "pdf" in (content_type or "").lower()


def ensure_dir(path: str) -> None:
    os.makedirs(path, exist_ok=True)


def append_manifest(out_dir: str, record: Dict[str, Any]) -> None:
    try:
        ensure_dir(out_dir)
        with open(os.path.join(out_dir, MANIFEST_NAME), "a", encoding="utf-8") as f:
            f.write(json.dumps(record, ensure_ascii=False) + "\n")
    except Exception:
        pass


def load_manifest(out_dir: str) -> Dict[str, Dict[str, Any]]:
    data: Dict[str, Dict[str, Any]] = {}
    try:
        mpath = os.path.join(out_dir, MANIFEST_NAME)
        if not os.path.exists(mpath):
            return data
        with open(mpath, "r", encoding="utf-8") as f:
            for line in f:
                try:
                    rec = json.loads(line)
                    p = rec.get("path")
                    sha = rec.get("sha256")
                    if p:
                        data[p] = rec
                    if sha:
                        data[sha] = rec
                except Exception:
                    continue
    except Exception:
        pass
    return data
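
# Illustrative manifest record, one JSON object per line in manifest.jsonl.
# The keys mirror what download_pdf() writes: the caller-supplied meta dict
# plus the content hash and local path (the values below are made up):
#   {"title": "Example Paper", "year": "2022", "venue": "arXiv",
#    "source": "arxiv", "sha256": "<64-char hex digest>",
#    "path": "/tmp/polymer_rag_pdfs/0123456789abcdef_paper.pdf"}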


# --------------------------------------------------------------------------------------
# DOWNLOAD SINGLE PDF
# --------------------------------------------------------------------------------------
def download_pdf(
    url: str,
    out_dir: str,
    suggested_name: Optional[str] = None,
    timeout: int = 60,
    meta: Optional[Dict[str, Any]] = None,
    manifest: Optional[Dict[str, Dict[str, Any]]] = None,
) -> Optional[str]:
    """
    Download a PDF and return local file path, or None on failure.
    Deduplicates by SHA256 content hash.
    Writes manifest record if meta provided.
    """
    try:
        headers = {"User-Agent": f"polymer-rag/1.0 ({DEFAULT_MAILTO})"}
        with requests.get(
            url, headers=headers, timeout=timeout, stream=True, allow_redirects=True
        ) as r:
            r.raise_for_status()
            content_type = r.headers.get("Content-Type", "")
            raw = r.content
            if not raw or not is_probably_pdf(raw, content_type):
                return None

            sha = sha256_bytes(raw)
            ensure_dir(out_dir)

            # Check manifest for existing SHA
            if manifest and sha in manifest:
                existing_path = manifest[sha].get("path")
                if existing_path and os.path.exists(existing_path):
                    return existing_path

            # Check filesystem for existing files with this hash
            existing = list(pathlib.Path(out_dir).glob(f"{sha[:16]}*.pdf"))
            if existing:
                path = str(existing[0])
                if meta:
                    rec = dict(meta)
                    rec.update({"sha256": sha, "path": path})
                    append_manifest(out_dir, rec)
                return path

            base = suggested_name or pathlib.Path(url).name or "paper.pdf"
            base = safe_filename(base)
            if not base.lower().endswith(".pdf"):
                base += ".pdf"
            fname = f"{sha[:16]}_{base}"
            fpath = os.path.join(out_dir, fname)

            with open(fpath, "wb") as f:
                f.write(raw)

            if meta:
                rec = dict(meta)
                rec.update({"sha256": sha, "path": fpath})
                append_manifest(out_dir, rec)

            return fpath
    except Exception:
        return None


def retry(fn, args, retries=3, sleep=0.6, **kwargs):
    """Call fn(*args, **kwargs) up to `retries` times, backing off exponentially between attempts."""
    for i in range(retries):
        out = fn(*args, **kwargs)
        if out:
            return out
        if i < retries - 1:  # skip the pointless sleep after the final attempt
            time.sleep(sleep * (2**i))
    return None
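
# Usage sketch: with the defaults above, a failing call is attempted up to
# three times, with 0.6 s and then 1.2 s of backoff between attempts, e.g.
#   retry(download_pdf, (url, out_dir), manifest=manifest)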


def download_one(entry: Union[str, Dict[str, Any]], out_dir: str, manifest: Dict):
    if isinstance(entry, dict):
        return download_pdf(
            entry["url"],
            out_dir,
            suggested_name=entry.get("name"),
            meta=entry.get("meta"),
            manifest=manifest,
        )
    return download_pdf(entry, out_dir, manifest=manifest)


def parallel_download_pdfs(
    entries: List[Union[str, Dict[str, Any]]],
    out_dir: str,
    manifest: Dict[str, Dict[str, Any]],
    max_workers: int = 12,
    desc: str = "Downloading PDFs",
) -> List[str]:
    ensure_dir(out_dir)
    results: List[str] = []
    if not entries:
        return results
    with ThreadPoolExecutor(max_workers=max_workers) as ex:
        futs = [ex.submit(retry, download_one, (e, out_dir, manifest)) for e in entries]
        for f in tqdm(as_completed(futs), total=len(futs), desc=desc):
            p = f.result()
            if p:
                results.append(p)
    return results


# --------------------------------------------------------------------------------------
# ARXIV
# --------------------------------------------------------------------------------------
def arxiv_query_from_keywords(keywords: List[str]) -> str:
    kw = [k.replace(" ", "+") for k in keywords]
    terms = " OR ".join([f"ti:{k}" for k in kw] + [f"abs:{k}" for k in kw])
    cats = (
        "cat:cond-mat.mtrl-sci OR cat:cond-mat.soft OR cat:physics.chem-ph OR cat:cs.LG OR cat:stat.ML"
    )
    return f"({terms}) AND ({cats})"


def fetch_arxiv_pdf_urls(keywords: List[str], max_results: int = 800) -> List[str]:
    """
    Extract explicit pdf links and fallback to building from id entries.
    """
    query = arxiv_query_from_keywords(keywords)
    params = {
        "search_query": query,
        "start": 0,
        "max_results": max_results,
        "sortBy": "submittedDate",
        "sortOrder": "descending",
    }
    headers = {"User-Agent": f"polymer-rag/1.0 ({DEFAULT_MAILTO})"}
    try:
        resp = requests.get(ARXIV_SEARCH_URL, params=params, headers=headers, timeout=60)
        resp.raise_for_status()
        xml = resp.text
    except Exception:
        return []

    pdfs: List[str] = []
    seen = set()

    # explicit pdf hrefs
    for p in re.findall(r'href="(https?://arxiv\.org/pdf[^"]*)"', xml):
        if p not in seen:
            pdfs.append(p)
            seen.add(p)

    # fallback: build from id entries
    for aid in re.findall(r'<id>(https?://arxiv\.org/abs[^<]*)</id>', xml):
        m = re.search(r"arxiv\.org/abs/([^?v]+)", aid)
        if m:
            identifier = m.group(1)
            pdf = f"https://arxiv.org/pdf/{identifier}.pdf"
            if pdf not in seen:
                pdfs.append(pdf)
                seen.add(pdf)

    return pdfs


def fetch_arxiv_pdfs(
    keywords: List[str],
    out_dir: str,
    manifest: Dict[str, Dict[str, Any]],
    max_results: int = 800,
) -> List[str]:
    urls = fetch_arxiv_pdf_urls(keywords, max_results=max_results)
    entries = [
        {
            "url": u,
            "name": u.rstrip("/").split("/")[-1],
            "meta": {"source": "arxiv", "url": u},
        }
        for u in urls
    ]
    paths = parallel_download_pdfs(entries, out_dir, manifest, max_workers=8, desc="arXiv PDFs")
    return paths


# --------------------------------------------------------------------------------------
# OPENALEX
# --------------------------------------------------------------------------------------
def openalex_fetch_works_try(
    search: str,
    filter_str: str,
    per_page: int,
    page: int,
    mailto: Optional[str],
) -> Dict[str, Any]:
    headers = {"User-Agent": f"polymer-rag/1.0 ({mailto or DEFAULT_MAILTO})"}
    params: Dict[str, Any] = {
        "search": search,
        "per-page": per_page,
        "per_page": per_page,
        "page": page,
        "sort": "publication_date:desc",
    }
    if filter_str:
        params["filter"] = filter_str
    if mailto:
        params["mailto"] = mailto

    resp = requests.get(OPENALEX_WORKS_URL, params=params, headers=headers, timeout=60)
    resp.raise_for_status()
    return resp.json()


def openalex_fetch_works(
    keywords: List[str],
    max_results: int = 600,
    per_page: int = 200,
    mailto: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """
    Try multiple query forms with relaxed filters if needed.
    """
    kws = sorted(set(keywords or []), key=str.lower)
    combined = " ".join(kws)
    or_query = " OR ".join(kws)

    attempts = [
        {"q": combined, "filter": "is_oa:true,language:en"},
        {"q": or_query, "filter": "is_oa:true,language:en"},
        {"q": or_query, "filter": "is_oa:true"},
        {"q": or_query, "filter": ""},
    ]

    works: List[Dict[str, Any]] = []
    for attempt in attempts:
        search = attempt["q"]
        filter_str = attempt["filter"]
        page = 1
        while len(works) < max_results:
            try:
                data = openalex_fetch_works_try(
                    search, filter_str, per_page, page, mailto or DEFAULT_MAILTO
                )
            except Exception as e:
                print(f"[WARN] OpenAlex request failed: {e}")
                break

            results = data.get("results", [])
            if not results:
                break

            works.extend(results)
            if len(results) < per_page:
                break
            page += 1
            time.sleep(0.12)

        if works:
            break  # stop at the first query form that returns any results

    return works[:max_results]


def openalex_extract_pdf_entries(
    works: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """
    Extract candidate PDF URLs and metadata from OpenAlex works.
    """
    out: List[Dict[str, Any]] = []
    seen_urls = set()

    for w in works:
        pdf = ""
        best = w.get("best_oa_location") or {}
        if isinstance(best, dict):
            pdf = best.get("pdf_url") or best.get("url_for_pdf") or best.get("url") or ""
        if not pdf:
            pl = w.get("primary_location") or {}
            if isinstance(pl, dict):
                pdf = (
                    pl.get("pdf_url")
                    or pl.get("url_for_pdf")
                    or pl.get("landing_page_url")
                    or ""
                )
        if not pdf:
            oa = w.get("open_access") or {}
            if isinstance(oa, dict):
                pdf = oa.get("oa_url") or oa.get("oa_url_for_pdf") or ""
        if not pdf or pdf in seen_urls:
            continue
        seen_urls.add(pdf)

        title = (w.get("title") or w.get("display_name") or "").strip()
        year = w.get("publication_year") or w.get("publication_date") or ""
        venue = ""
        pl = w.get("primary_location") or {}
        if isinstance(pl, dict):
            venue = (pl.get("source") or {}).get("display_name") or ""
        if not venue:
            venue = ((w.get("host_venue") or {}).get("display_name") or "").strip()

        name = " - ".join([s for s in [title, venue, str(year)] if s])

        meta = {"title": title, "year": year, "venue": venue, "source": "openalex"}
        out.append({"url": pdf, "name": name, "meta": meta})

    return out
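
# Abridged shape of an OpenAlex work that this extractor relies on (field
# names are real OpenAlex API fields; the values here are made up):
#   {"title": "Example", "publication_year": 2023,
#    "best_oa_location": {"pdf_url": "https://example.org/paper.pdf"},
#    "primary_location": {"source": {"display_name": "Macromolecules"}}}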


def fetch_openalex_pdfs(
    keywords: List[str],
    out_dir: str,
    manifest: Dict[str, Dict[str, Any]],
    max_results: int = 600,
    mailto: Optional[str] = None,
) -> List[str]:
    works = openalex_fetch_works(keywords, max_results=max_results, mailto=mailto)
    if not works:
        print("[INFO] OpenAlex returned no works for given queries/filters.")
        return []

    entries = openalex_extract_pdf_entries(works)
    if not entries:
        print("[INFO] OpenAlex works found, but no PDF links extracted.")
        return []

    paths = parallel_download_pdfs(
        entries, out_dir, manifest, max_workers=16, desc="OpenAlex PDFs"
    )
    return paths


# --------------------------------------------------------------------------------------
# EUROPE PMC
# --------------------------------------------------------------------------------------
def epmc_query_from_keywords(keywords: List[str]) -> str:
    return " OR ".join([f'"{k}"' for k in keywords])


def epmc_extract_pdf_entries_from_results(
    results: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    out: List[Dict[str, Any]] = []
    seen = set()

    for r in results:
        ftl = r.get("fullTextUrlList") or {}
        urls: List[str] = []
        if isinstance(ftl, dict):
            for ful in ftl.get("fullTextUrl") or []:
                if isinstance(ful, dict):
                    u = ful.get("url") or ""
                    if u:
                        urls.append(u)
        if not urls:
            fu = r.get("fullTextUrl")
            if isinstance(fu, str) and fu:
                urls.append(fu)

        for u in urls:
            if not u or u in seen:
                continue
            seen.add(u)

            title = (r.get("title") or "").strip()
            year = r.get("firstPublicationDate") or r.get("pubYear") or ""
            name = " - ".join([s for s in [title, str(year)] if s])

            out.append(
                {
                    "url": u,
                    "name": name,
                    "meta": {"title": title, "year": year, "source": "epmc"},
                }
            )

    return out


def fetch_epmc_pdfs(
    keywords: List[str],
    out_dir: str,
    manifest: Dict[str, Dict[str, Any]],
    max_results: int = 200,
    page_size: int = 25,
) -> List[str]:
    """
    Query Europe PMC and extract fullTextUrlList entries.
    """
    q = epmc_query_from_keywords(keywords)
    params = {
        "query": q,
        "format": "json",
        "pageSize": page_size,
        "sort": "FIRST_PDATE desc",
    }
    headers = {"User-Agent": f"polymer-rag/1.0 ({DEFAULT_MAILTO})"}
    saved: List[str] = []
    page = 1
    total_fetched = 0

    while total_fetched < max_results:
        params["page"] = cursor
        try:
            resp = requests.get(EPMC_SEARCH_URL, params=params, headers=headers, timeout=30)
            resp.raise_for_status()
            data = resp.json()
        except Exception as e:
            print(f"[WARN] Europe PMC request failed: {e}")
            break

        results = (data.get("resultList") or {}).get("result") or []
        if not results:
            break

        entries = epmc_extract_pdf_entries_from_results(results)
        if not entries:
            page += 1
            total_fetched += len(results)
            time.sleep(0.2)
            continue

        paths = parallel_download_pdfs(entries, out_dir, manifest, max_workers=8, desc="Europe PMC PDFs")
        saved.extend(paths)

        total_fetched += len(results)
        page += 1
        time.sleep(0.2)

    return saved


# --------------------------------------------------------------------------------------
# POLYMER JOURNALS OA
# --------------------------------------------------------------------------------------
def fetch_polymer_journal_pdfs(
    journal_queries: List[Dict[str, Any]],
    out_dir: str,
    manifest: Dict[str, Dict[str, Any]],
    max_per_journal: int = 50,
    mailto: Optional[str] = None,
) -> List[str]:
    """
    Fetch OA papers from specific polymer journals via OpenAlex.
    """
    all_paths: List[str] = []
    for jq in journal_queries:
        journal_name = jq["journal"]
        issn = jq.get("issn", "")
        publisher = jq.get("publisher", "")
        print(f"β†’ Fetching from {journal_name} ({publisher})...")

        # Build OpenAlex filter for this journal
        filter_parts = ["is_oa:true", "language:en"]
        if issn:
            filter_parts.append(f"primary_location.source.issn:{issn}")
        filter_str = ",".join(filter_parts)

        # Search for polymer-related content in this journal
        search_query = "polymer OR macromolecule OR copolymer"
        page = 1
        journal_works = []
        while len(journal_works) < max_per_journal:
            try:
                data = openalex_fetch_works_try(
                    search_query, filter_str, 25, page, mailto or DEFAULT_MAILTO
                )
            except Exception as e:
                print(f"[WARN] Failed to fetch {journal_name}: {e}")
                break

            results = data.get("results", [])
            if not results:
                break
            journal_works.extend(results)
            if len(results) < 25:
                break
            page += 1
            time.sleep(0.15)

        if journal_works:
            entries = openalex_extract_pdf_entries(journal_works[:max_per_journal])
            # Tag with journal source
            for e in entries:
                e["meta"]["journal"] = journal_name
                e["meta"]["publisher"] = publisher
                e["meta"]["source"] = f"{journal_name}_{publisher}".lower()

            paths = parallel_download_pdfs(
                entries, out_dir, manifest, max_workers=8, desc=f"{journal_name} PDFs"
            )
            all_paths.extend(paths)
            print(f"  β†’ Downloaded {len(paths)} PDFs from {journal_name}")
            time.sleep(0.3)

    return all_paths


# --------------------------------------------------------------------------------------
# WRAPPER FOR OPENAI EMBEDDINGS
# --------------------------------------------------------------------------------------
class PolymerStyleOpenAIEmbeddings(OpenAIEmbeddings):
    """
    OpenAI embeddings wrapper for polymer RAG.
    Default model: text-embedding-3-small (1536-D) ← FIXED
    """

    def __init__(self, model: str = "text-embedding-3-small", **kwargs):
        super().__init__(model=model, **kwargs)


# --------------------------------------------------------------------------------------
# TOKENIZER FOR TRUE TOKEN-BASED SEGMENTATION
# --------------------------------------------------------------------------------------
TOKENIZER = tiktoken.get_encoding("cl100k_base")


def token_length(text: str) -> int:
    if not text:
        return 0
    return len(TOKENIZER.encode(text))
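
# Example (illustrative): token_length("hello world") == 2 under cl100k_base.
# Counts are BPE tokens rather than words, so chemistry terms often split
# into several tokens and token counts can exceed word counts.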


# --------------------------------------------------------------------------------------
# METADATA ENRICHMENT FROM MANIFEST
# --------------------------------------------------------------------------------------
def attach_extra_metadata_from_manifest(
    docs: List[Any], manifest: Dict[str, Dict[str, Any]]
) -> None:
    """
    Enrich Document metadata with manifest data for later citation.
    """
    for d in docs:
        src_path = d.metadata.get("source", "")
        if not src_path:
            continue

        rec = manifest.get(src_path)
        if not rec:
            for k, v in manifest.items():
                if os.path.basename(k) == os.path.basename(src_path):
                    rec = v
                    break
        if rec:
            for k in ["title", "year", "venue", "url", "source", "journal", "publisher"]:
                if k in rec:
                    d.metadata[k] = rec[k]


# --------------------------------------------------------------------------------------
# MULTI-SCALE CHUNKING
# --------------------------------------------------------------------------------------
def multiscale_chunk_documents(
    docs: List[Any], min_chunk_tokens: int = 32
) -> List[Any]:
    """
    Multi-scale segmentation at TOKEN level: 512, 256, 128 token windows.
    """
    splitter_specs = [
        ("tokens=512", 512, 64),  # 50% tokens overlap
        ("tokens=256", 256, 48),
        ("tokens=128", 128, 32),
    ]

    all_chunks: List[Any] = []
    seg_id = 0

    for scale_label, chunk_size, overlap in splitter_specs:
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=overlap,
            length_function=token_length,
            separators=["\n\n", "\n", ". ", " ", ""],
        )
        splits = splitter.split_documents(docs)
        for d in splits:
            if token_length(d.page_content or "") < min_chunk_tokens:
                continue
            d.metadata = dict(d.metadata or {})
            d.metadata["segment_scale"] = scale_label
            d.metadata["segment_id"] = seg_id
            seg_id += 1
            all_chunks.append(d)

    return all_chunks
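
# Usage sketch (illustrative): every input document is segmented at all three
# scales, so chunk counts per scale can be inspected like this:
#   chunks = multiscale_chunk_documents(docs)
#   counts = defaultdict(int)
#   for c in chunks:
#       counts[c.metadata["segment_scale"]] += 1
#   # -> {"tokens=512": ..., "tokens=256": ..., "tokens=128": ...}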


# --------------------------------------------------------------------------------------
# BUILD RETRIEVER FROM LOCAL PDFs
# --------------------------------------------------------------------------------------
def _split_and_build_retriever(
    documents_dir: str,
    persist_dir: Optional[str] = None,
    k: int = 10,
    embedding_model: str = "text-embedding-3-small",
    vector_backend: str = "chroma",
    min_chunk_tokens: int = 32,
    api_key: Optional[str] = None,
):
    """
    Load PDFs, chunk multi-scale, build dense retriever.
    FIXED: Always uses text-embedding-3-small (1536-D) and handles existing DB correctly.
    """
    print(f"β†’ Loading PDFs from {documents_dir}...")
    try:
        loader = DirectoryLoader(
            documents_dir,
            glob="*.pdf",
            loader_cls=PyPDFLoader,
            show_progress=True,
            use_multithreading=True,
            silent_errors=True,
        )
    except TypeError:  # older langchain versions lack the silent_errors kwarg
        loader = DirectoryLoader(
            documents_dir,
            glob="*.pdf",
            loader_cls=PyPDFLoader,
            show_progress=True,
            use_multithreading=True,
        )

    docs = loader.load()
    if not docs:
        raise RuntimeError("No PDF documents found to index.")

    manifest = load_manifest(documents_dir)
    attach_extra_metadata_from_manifest(docs, manifest)

    documents = multiscale_chunk_documents(docs, min_chunk_tokens=min_chunk_tokens)
    print(
        f"→ Created {len(documents)} multi-scale segments from {len(docs)} PDFs (512/256/128-token windows)."
    )

    print(f"β†’ Using OpenAI embeddings model: {embedding_model}")
    embeddings = PolymerStyleOpenAIEmbeddings(model=embedding_model, api_key=api_key)

    if vector_backend.lower() == "chroma":
        if persist_dir and os.path.exists(persist_dir):
            print(f"β†’ Deleting existing Chroma database at {persist_dir} to prevent dimension mismatch...")
            import shutil
            shutil.rmtree(persist_dir)
            print(f"β†’ Existing database deleted. Creating fresh database...")

        # Sanitize all text content to prevent Unicode errors
        for doc in documents:
            doc.page_content = sanitize_text(doc.page_content or "")
            for key, value in doc.metadata.items():
                if isinstance(value, str):
                    doc.metadata[key] = sanitize_text(value)

        # Process in batches to avoid rate limiting and memory issues
        batch_size = 500  # Adjust based on your document sizes (500 is safe for most cases)
        total_batches = (len(documents) + batch_size - 1) // batch_size
        print(f"β†’ Processing {len(documents)} documents in {total_batches} batches of {batch_size}...")

        vector_store = None
        for i in range(0, len(documents), batch_size):
            batch = documents[i : i + batch_size]
            batch_num = (i // batch_size) + 1
            print(f"  β†’ Embedding batch {batch_num}/{total_batches} ({len(batch)} documents)...")

            if vector_store is None:
                # First batch: create the vector store
                if persist_dir:
                    print(f"    β†’ Creating new Chroma database at {persist_dir}")
                    vector_store = Chroma.from_documents(
                        batch, embeddings, persist_directory=persist_dir
                    )
                else:
                    # In-memory mode also needs batching
                    vector_store = Chroma.from_documents(batch, embeddings)
            else:
                # Subsequent batches: add to existing store
                vector_store.add_documents(batch)

            time.sleep(0.5)  # Small delay to avoid rate limiting

    elif vector_backend.lower() == "faiss":
        try:
            from langchain_community.vectorstores import FAISS
        except Exception as e:
            raise RuntimeError("FAISS requested but not available") from e

        # Sanitize all text content
        for doc in documents:
            doc.page_content = sanitize_text(doc.page_content or "")
            for key, value in doc.metadata.items():
                if isinstance(value, str):
                    doc.metadata[key] = sanitize_text(value)

        # FAISS also needs batching
        batch_size = 500
        total_batches = (len(documents) + batch_size - 1) // batch_size
        print(f"β†’ Processing {len(documents)} documents in {total_batches} batches of {batch_size}...")

        vector_store = None
        for i in range(0, len(documents), batch_size):
            batch = documents[i : i + batch_size]
            batch_num = (i // batch_size) + 1
            print(f"  β†’ Embedding batch {batch_num}/{total_batches} ({len(batch)} documents)...")

            if vector_store is None:
                vector_store = FAISS.from_documents(batch, embeddings)
            else:
                batch_store = FAISS.from_documents(batch, embeddings)
                vector_store.merge_from(batch_store)

            time.sleep(0.5)

    else:
        raise ValueError("vector_backend must be 'chroma' or 'faiss'")

    vector_retriever = vector_store.as_retriever(search_kwargs={"k": k})
    print("β†’ RAG KB ready (dense retriever over multi-scale segments).")
    return vector_retriever


# --------------------------------------------------------------------------------------
# PUBLIC API: BUILD RETRIEVER FROM WEB
# --------------------------------------------------------------------------------------
def build_retriever_from_web(
    polymer_keywords: Optional[List[str]] = None,
    target_curated: int = TARGET_CURATED,
    target_journals: int = TARGET_JOURNALS,
    target_arxiv: int = TARGET_ARXIV,
    target_openalex: int = TARGET_OPENALEX,
    target_epmc: int = TARGET_EPMC,
    extra_pdf_urls: Optional[List[str]] = None,
    persist_dir: str = DEFAULT_PERSIST_DIR,
    tmp_download_dir: str = DEFAULT_TMP_DOWNLOAD_DIR,
    k: int = 10,
    embedding_model: str = "text-embedding-3-small",
    vector_backend: str = "chroma",
    mailto: Optional[str] = None,
    include_curated: bool = True,
):
    """
    Fetch balanced polymer corpus across multiple sources.

    Target distribution (~2000 PDFs):
    - Curated guidelines/standards: 100
    - Polymer journals OA: 200
    - arXiv: 800
    - OpenAlex: 600
    - Europe PMC: 200
    - Extra/databases: 100
    """
    polymer_keywords = sorted(set(polymer_keywords or POLYMER_KEYWORDS), key=str.lower)
    print("=" * 70)
    print("Fetching polymer PDFs from balanced sources...")
    print(
        f"Target: {target_curated} curated + {target_journals} journals + "
        f"{target_arxiv} arXiv + {target_openalex} OpenAlex + {target_epmc} EPMC"
    )

    ensure_dir(tmp_download_dir)
    manifest = load_manifest(tmp_download_dir)
    source_stats = defaultdict(int)
    all_paths: List[str] = []

    # --------------------------------------------------------------------------------------
    # 1) Curated sources (IUPAC, ISO/ASTM, polymer informatics reviews)
    # --------------------------------------------------------------------------------------
    if include_curated and CURATED_POLYMER_PDF_SOURCES:
        print(f"[1/6] Downloading {len(CURATED_POLYMER_PDF_SOURCES)} curated PDFs...")
        curated_paths = parallel_download_pdfs(
            CURATED_POLYMER_PDF_SOURCES[:target_curated],
            tmp_download_dir,
            manifest,
            max_workers=4,
            desc="Curated PDFs",
        )
        for p in curated_paths:
            if p not in all_paths:
                all_paths.append(p)
                source_stats["curated"] += 1
        print(f"  β†’ {len(curated_paths)} curated PDFs downloaded")

    # --------------------------------------------------------------------------------------
    # 2) Polymer journals OA
    # --------------------------------------------------------------------------------------
    try:
        print(f"[2/6] Fetching polymer journal PDFs (target: {target_journals})...")
        journal_paths = fetch_polymer_journal_pdfs(
            POLYMER_JOURNAL_QUERIES,
            tmp_download_dir,
            manifest,
            max_per_journal=target_journals // len(POLYMER_JOURNAL_QUERIES) + 1,
            mailto=mailto,
        )
        for p in journal_paths:
            if p not in all_paths:
                all_paths.append(p)
                source_stats["journal"] += 1
        print(f"  β†’ {len(journal_paths)} journal PDFs downloaded")
    except Exception as e:
        print(f"[WARN] Polymer journal fetch error: {e}")

    # --------------------------------------------------------------------------------------
    # 3) arXiv polymer-focused categories
    # --------------------------------------------------------------------------------------
    try:
        print(f"[3/6] Fetching arXiv PDFs (target: {target_arxiv})...")
        arxiv_paths = fetch_arxiv_pdfs(
            polymer_keywords, tmp_download_dir, manifest, max_results=target_arxiv
        )
        for p in arxiv_paths:
            if p not in all_paths:
                all_paths.append(p)
                source_stats["arxiv"] += 1
        print(f"  β†’ {len(arxiv_paths)} arXiv PDFs downloaded")
    except Exception as e:
        print(f"[WARN] arXiv fetch error: {e}")

    # --------------------------------------------------------------------------------------
    # 4) OpenAlex broad polymer search
    # --------------------------------------------------------------------------------------
    try:
        print(f"[4/6] Fetching OpenAlex PDFs (target: {target_openalex})...")
        openalex_paths = fetch_openalex_pdfs(
            polymer_keywords,
            tmp_download_dir,
            manifest,
            max_results=target_openalex,
            mailto=mailto,
        )
        for p in openalex_paths:
            if p not in all_paths:
                all_paths.append(p)
                source_stats["openalex"] += 1
        print(f"  β†’ {len(openalex_paths)} OpenAlex PDFs downloaded")
    except Exception as e:
        print(f"[WARN] OpenAlex fetch error: {e}")

    # --------------------------------------------------------------------------------------
    # 5) Europe PMC biopolymers/materials
    # --------------------------------------------------------------------------------------
    try:
        print(f"[5/6] Fetching Europe PMC PDFs (target: {target_epmc})...")
        epmc_paths = fetch_epmc_pdfs(
            polymer_keywords, tmp_download_dir, manifest, max_results=target_epmc
        )
        for p in epmc_paths:
            if p not in all_paths:
                all_paths.append(p)
                source_stats["epmc"] += 1
        print(f"  β†’ {len(epmc_paths)} Europe PMC PDFs downloaded")
    except Exception as e:
        print(f"[WARN] Europe PMC fetch error: {e}")

    # --------------------------------------------------------------------------------------
    # 6) Extra URLs (user-provided, database exports, etc.)
    # --------------------------------------------------------------------------------------
    if extra_pdf_urls:
        print(f"[6/6] Downloading {len(extra_pdf_urls)} extra PDFs...")
        extra_entries = [
            {"url": u, "name": None, "meta": {"url": u, "source": "extra"}}
            for u in extra_pdf_urls
        ]
        extra_paths = parallel_download_pdfs(
            extra_entries, tmp_download_dir, manifest, max_workers=8, desc="Extra PDFs"
        )
        for p in extra_paths:
            if p not in all_paths:
                all_paths.append(p)
                source_stats["extra"] += 1
        print(f"  β†’ {len(extra_paths)} extra PDFs downloaded")

    # --------------------------------------------------------------------------------------
    # Summary
    # --------------------------------------------------------------------------------------
    total = len(all_paths)
    print("=" * 70)
    print("DOWNLOAD SUMMARY")
    print("=" * 70)
    print(f"Total unique PDFs downloaded: {total}")
    print(" by source:")
    for source, count in sorted(source_stats.items()):
        pct = (count / total * 100) if total > 0 else 0
        print(f"  {source:20s} {count:4d} PDFs ({pct:5.1f}%)")
    print("=" * 70)

    if total == 0:
        raise RuntimeError(
            "No PDFs fetched. Adjust keywords, targets, or add extra_pdf_urls."
        )

    print("Building knowledge base from downloaded PDFs...")
    retriever = _split_and_build_retriever(
        documents_dir=tmp_download_dir,
        persist_dir=persist_dir,
        k=k,
        embedding_model=embedding_model,
        vector_backend=vector_backend,
    )

    return retriever


# --------------------------------------------------------------------------------------
# PUBLIC API: BUILD RETRIEVER FROM LOCAL PAPERS
# --------------------------------------------------------------------------------------
def build_retriever(
    papers_path: str,
    persist_dir: Optional[str] = DEFAULT_PERSIST_DIR,
    k: int = 10,
    embedding_model: str = "text-embedding-3-small",
    vector_backend: str = "chroma",
):
    """
    Build polymer RAG KB from local PDFs.
    """
    print("Building RAG knowledge base from local PDFs...")
    return _split_and_build_retriever(
        documents_dir=papers_path,
        persist_dir=persist_dir,
        k=k,
        embedding_model=embedding_model,
        vector_backend=vector_backend,
    )


# --------------------------------------------------------------------------------------
# CONVENIENCE WRAPPER: POLYMER FOUNDATION MODELS
# --------------------------------------------------------------------------------------
def build_retriever_polymer_foundation_models(
    persist_dir: str = DEFAULT_PERSIST_DIR,
    k: int = 10,
    vector_backend: str = "chroma",
):
    """
    Convenience wrapper for polymer foundation model corpus.
    """
    fm_kw = list(
        set(POLYMER_KEYWORDS)
        | {
            "BigSMILES",
            "PSMILES",
            "polymer SMILES",
            "polymer language model",
            "foundation model polymer",
            "masked language model polymer",
            "self-supervised polymer",
            "generative polymer",
            "polymer sequence modeling",
            "representation learning polymer",
        }
    )
    return build_retriever_from_web(
        polymer_keywords=fm_kw,
        target_curated=100,
        target_journals=200,
        target_arxiv=800,
        target_openalex=600,
        target_epmc=200,
        persist_dir=persist_dir,
        k=k,
        embedding_model="text-embedding-3-small",
        vector_backend=vector_backend,
    )


# --------------------------------------------------------------------------------------
# MAIN
# --------------------------------------------------------------------------------------
if __name__ == "__main__":
    retriever = build_retriever_from_web(
        polymer_keywords=POLYMER_KEYWORDS,
        target_curated=100,
        target_journals=200,
        target_arxiv=800,
        target_openalex=600,
        target_epmc=200,
        persist_dir="chroma_polymer_db_balanced",
        tmp_download_dir=DEFAULT_TMP_DOWNLOAD_DIR,
        k=10,
        embedding_model="text-embedding-3-small",
        vector_backend="chroma",
        mailto=DEFAULT_MAILTO,
        include_curated=True,
    )

    print("\n" + "=" * 70)
    print("Testing retrieval with sample query")
    docs = retriever.get_relevant_documents("PSMILES polymer electrolyte design")
    for i, d in enumerate(docs, 1):
        meta = d.metadata or {}
        title = meta.get("title") or os.path.basename(meta.get("source", "")) or "document"
        year = meta.get("year", "")
        src = meta.get("source", "unknown")
        journal = meta.get("journal", "")
        scale = meta.get("segment_scale", "")
        source_str = f"{src}"
        if journal:
            source_str = f"{journal} ({src})"
        print(f"\n[{i}] {title}")
        print(f"    Year: {year} | Source: {source_str} | Scale: {scale}")
        print(f"    Content: {(d.page_content or '')[:200]}...")