| from __future__ import annotations | |
| import os | |
| import re | |
| import time | |
| import json | |
| import hashlib | |
| import pathlib | |
| import tempfile | |
| from typing import List, Optional, Dict, Any, Union | |
| from concurrent.futures import ThreadPoolExecutor, as_completed | |
| from collections import defaultdict | |
| import requests | |
| from tqdm import tqdm | |
| # -------------------------------------------------------------------------------------- | |
| # Vector store, loaders, splitters | |
| # -------------------------------------------------------------------------------------- | |
| from langchain_community.vectorstores import Chroma | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader | |
| # -------------------------------------------------------------------------------------- | |
| # OpenAI embeddings | |
| # -------------------------------------------------------------------------------------- | |
| from langchain_openai import OpenAIEmbeddings | |
| # -------------------------------------------------------------------------------------- | |
| # Tokenizer for true token-based multi-scale segmentation | |
| # -------------------------------------------------------------------------------------- | |
| import tiktoken | |
| def sanitize_text(text: str) -> str: | |
| """ | |
| Remove surrogate pairs and invalid Unicode characters. | |
| Prevents UnicodeEncodeError when adding documents to ChromaDB. | |
| """ | |
| if not text: | |
| return text | |
| # Replace surrogates and invalid chars with empty string | |
| return text.encode("utf-8", errors="ignore").decode("utf-8", errors="ignore") | |
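| # Illustrative example: a scraped string containing a lone surrogate such as "poly\ud835mer" | |
| # sanitizes to "polymer", so ChromaDB never receives unencodable code points. | |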
| # -------------------------------------------------------------------------------------- | |
| # ARXIV, OPENALEX, EPMC API URLS | |
| # -------------------------------------------------------------------------------------- | |
| ARXIV_SEARCH_URL = "http://export.arxiv.org/api/query" | |
| OPENALEX_WORKS_URL = "https://api.openalex.org/works" | |
| EPMC_SEARCH_URL = "https://www.ebi.ac.uk/europepmc/webservices/rest/search" | |
| DEFAULT_PERSIST_DIR = "chroma_polymer_db" | |
| DEFAULT_TMP_DOWNLOAD_DIR = os.path.join(tempfile.gettempdir(), "polymer_rag_pdfs") | |
| MANIFEST_NAME = "manifest.jsonl" | |
| # -------------------------------------------------------------------------------------- | |
| # Balanced target distribution | |
| # -------------------------------------------------------------------------------------- | |
| TARGET_CURATED = 100 | |
| TARGET_JOURNALS = 200 | |
| TARGET_ARXIV = 800 | |
| TARGET_OPENALEX = 600 | |
| TARGET_EPMC = 200 | |
| TARGET_DATABASES = 100 | |
| # -------------------------------------------------------------------------------------- | |
| # Polymer keywords | |
| # -------------------------------------------------------------------------------------- | |
| POLYMER_KEYWORDS = [ | |
| "polymer", | |
| "macromolecule", | |
| "macromolecular", | |
| "polymeric", | |
| "polymer informatics", | |
| "polymer chemistry", | |
| "polymer physics", | |
| "PSMILES", | |
| "pSMILES", | |
| "BigSMILES", | |
| "polymer SMILES", | |
| "polymer sequence", | |
| "polymer electrolyte", | |
| "polymer morphology", | |
| "polymer dielectric", | |
| "polymer electrolyte membrane", | |
| "block copolymer", | |
| "biopolymer", | |
| "polymer nanocomposite", | |
| "polymer foundation model", | |
| "self-supervised polymer", | |
| "masked language model polymer", | |
| "polymer transformer", | |
| "generative polymer", | |
| "copolymer", | |
| "polymerization", | |
| "polymer synthesis", | |
| "polymer characterization", | |
| ] | |
| # -------------------------------------------------------------------------------------- | |
| # IUPAC Guidelines & Standards (polymer nomenclature and terminology standards) | |
| # -------------------------------------------------------------------------------------- | |
| CURATED_IUPAC_STANDARDS: List[Dict[str, Any]] = [ | |
| { | |
| "url": "https://iupac.org/wp-content/uploads/2019/07/140-Brief-Guide-to-Polymer-Nomenclature-Web-Final-d.pdf", | |
| "name": "IUPAC - Brief Guide to Polymer Nomenclature", | |
| "meta": { | |
| "title": "A Brief Guide to Polymer Nomenclature (IUPAC Technical Report)", | |
| "year": "2012", | |
| "venue": "IUPAC Pure and Applied Chemistry", | |
| "source": "curated_iupac_standard", | |
| }, | |
| }, | |
| { | |
| "url": "https://rseq.org/wp-content/uploads/2022/10/20220816-English-BriefGuidePolymerTerminology-IUPAC.pdf", | |
| "name": "IUPAC - Brief Guide to Polymerization Terminology", | |
| "meta": { | |
| "title": "A Brief Guide to Polymerization Terminology (IUPAC Recommendations)", | |
| "year": "2022", | |
| "venue": "IUPAC", | |
| "source": "curated_iupac_standard", | |
| }, | |
| }, | |
| { | |
| "url": "https://www.rsc.org/images/richard-jones-naming-polymers_tcm18-243646.pdf", | |
| "name": "RSC - Naming Polymers", | |
| "meta": { | |
| "title": "Naming Polymers (RSC Educational Resource)", | |
| "year": "2020", | |
| "venue": "Royal Society of Chemistry", | |
| "source": "curated_iupac_standard", | |
| }, | |
| }, | |
| ] | |
| # -------------------------------------------------------------------------------------- | |
| # ISO/ASTM Standards (polymer testing and characterization standards) | |
| # -------------------------------------------------------------------------------------- | |
| CURATED_ISO_ASTM_STANDARDS: List[Dict[str, Any]] = [ | |
| { | |
| "url": "https://cdn.standards.iteh.ai/samples/76910/29c8e7af07bd4188b297c39684ada79e/ISO-ASTM-52925-2022.pdf", | |
| "name": "ISO/ASTM 52925:2022 - Additive Manufacturing Polymers", | |
| "meta": { | |
| "title": "ISO/ASTM 52925:2022 Additive manufacturing of polymers - Feedstock materials", | |
| "year": "2022", | |
| "venue": "ISO/ASTM", | |
| "source": "curated_iso_astm_standard", | |
| }, | |
| }, | |
| { | |
| "url": "https://cdn.standards.iteh.ai/samples/76909/b9883b2f204248aca175e2f574bd879c/ISO-ASTM-52924-2023.pdf", | |
| "name": "ISO/ASTM 52924:2023 - Additive Manufacturing Qualification", | |
| "meta": { | |
| "title": "ISO/ASTM 52924:2023 Additive manufacturing of polymers - Qualification principles", | |
| "year": "2023", | |
| "venue": "ISO/ASTM", | |
| "source": "curated_iso_astm_standard", | |
| }, | |
| }, | |
| { | |
| "url": "https://nvlpubs.nist.gov/nistpubs/ir/2015/NIST.IR.8059.pdf", | |
| "name": "NIST IR 8059 - Materials Testing Standards for Additive Manufacturing", | |
| "meta": { | |
| "title": "Materials Testing Standards for Additive Manufacturing of Polymer Materials", | |
| "year": "2015", | |
| "venue": "NIST", | |
| "source": "curated_iso_astm_standard", | |
| }, | |
| }, | |
| ] | |
| # -------------------------------------------------------------------------------------- | |
| # Foundational polymer informatics papers | |
| # -------------------------------------------------------------------------------------- | |
| CURATED_POLYMER_INFORMATICS: List[Dict[str, Any]] = [ | |
| { | |
| "url": "https://ramprasad.mse.gatech.edu/wp-content/uploads/2021/01/polymer-informatics.pdf", | |
| "name": "Polymer Informatics - Current Status and Critical Next Steps", | |
| "meta": { | |
| "title": "Polymer informatics: Current status and critical next steps", | |
| "year": "2020", | |
| "venue": "Materials Science and Engineering: R", | |
| "source": "curated_review_informatics", | |
| }, | |
| }, | |
| { | |
| "url": "https://arxiv.org/pdf/2011.00508.pdf", | |
| "name": "Polymer Informatics - Current Status (arXiv)", | |
| "meta": { | |
| "title": "Polymer Informatics: Current Status and Critical Next Steps", | |
| "year": "2020", | |
| "venue": "arXiv:2011.00508", | |
| "source": "curated_review_informatics", | |
| }, | |
| }, | |
| ] | |
| # -------------------------------------------------------------------------------------- | |
| # BigSMILES notation papers (polymer representation standards) | |
| # -------------------------------------------------------------------------------------- | |
| CURATED_BIGSMILES: List[Dict[str, Any]] = [ | |
| { | |
| "url": "https://pubs.acs.org/doi/pdf/10.1021/acscentsci.9b00476", | |
| "name": "BigSMILES - Structurally-Based Line Notation", | |
| "meta": { | |
| "title": "BigSMILES: A Structurally-Based Line Notation for Describing Macromolecules", | |
| "year": "2019", | |
| "venue": "ACS Central Science", | |
| "source": "curated_bigsmiles", | |
| }, | |
| }, | |
| { | |
| "url": "https://www.rsc.org/suppdata/d3/dd/d3dd00147d/d3dd00147d1.pdf", | |
| "name": "Generative BigSMILES - Supplementary Information", | |
| "meta": { | |
| "title": "Generative BigSMILES: an extension for polymer informatics (SI)", | |
| "year": "2024", | |
| "venue": "RSC Digital Discovery", | |
| "source": "curated_bigsmiles", | |
| }, | |
| }, | |
| ] | |
| # -------------------------------------------------------------------------------------- | |
| # Combine all curated sources | |
| # -------------------------------------------------------------------------------------- | |
| CURATED_POLYMER_PDF_SOURCES = ( | |
| CURATED_IUPAC_STANDARDS | |
| + CURATED_ISO_ASTM_STANDARDS | |
| + CURATED_POLYMER_INFORMATICS | |
| + CURATED_BIGSMILES | |
| ) | |
| # -------------------------------------------------------------------------------------- | |
| # Major polymer journals with OA content | |
| # -------------------------------------------------------------------------------------- | |
| POLYMER_JOURNAL_QUERIES = [ | |
| # ACS Journals | |
| {"journal": "Macromolecules", "issn": "0024-9297", "publisher": "ACS"}, | |
| {"journal": "ACS Polymers Au", "issn": "2768-1939", "publisher": "ACS"}, | |
| {"journal": "ACS Applied Polymer Materials", "issn": "2637-6105", "publisher": "ACS"}, | |
| {"journal": "Biomacromolecules", "issn": "1525-7797", "publisher": "ACS"}, | |
| {"journal": "ACS Macro Letters", "issn": "2161-1653", "publisher": "ACS"}, | |
| # RSC Journals | |
| {"journal": "Polymer Chemistry", "issn": "1759-9954", "publisher": "RSC"}, | |
| {"journal": "RSC Applied Polymers", "issn": "2755-0656", "publisher": "RSC"}, | |
| {"journal": "Soft Matter", "issn": "1744-683X", "publisher": "RSC"}, | |
| # Springer/Nature Journals | |
| {"journal": "Polymer Journal", "issn": "0032-3896", "publisher": "Nature"}, | |
| {"journal": "Journal of Polymer Science", "issn": "2642-4169", "publisher": "Wiley"}, | |
| # Additional OA Journals | |
| {"journal": "Polymer Science and Technology", "issn": "2837-0341", "publisher": "ACS"}, | |
| {"journal": "Polymers", "issn": "2073-4360", "publisher": "MDPI"}, | |
| ] | |
| DEFAULT_MAILTO = "kaur-m43@webmail.uwinnipeg.ca"  # polite default contact sent in User-Agent and mailto parameters | |
| # -------------------------------------------------------------------------------------- | |
| # DEDUPLICATION, DOWNLOAD, MANIFEST HELPERS | |
| # -------------------------------------------------------------------------------------- | |
| def sha256_bytes(data: bytes) -> str: | |
| return hashlib.sha256(data).hexdigest() | |
| def safe_filename(name: str) -> str: | |
| name = str(name or "").strip().replace("/", "_").replace("\\", "_") | |
| name = re.sub(r"[^a-zA-Z0-9._\-]", "_", name) | |
| return name[:200] | |
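| # Example: safe_filename("BigSMILES: A Structurally-Based Line Notation") returns | |
| # "BigSMILES__A_Structurally-Based_Line_Notation" (disallowed characters become "_", capped at 200 chars). | |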
| def is_probably_pdf(raw: bytes, content_type: str) -> bool: | |
| if not raw: | |
| return False | |
| if raw[:4] == b"%PDF": | |
| return True | |
| return "pdf" in (content_type or "").lower() | |
| def ensure_dir(path: str) -> None: | |
| os.makedirs(path, exist_ok=True) | |
| def append_manifest(out_dir: str, record: Dict[str, Any]) -> None: | |
| try: | |
| ensure_dir(out_dir) | |
| with open(os.path.join(out_dir, MANIFEST_NAME), "a", encoding="utf-8") as f: | |
| f.write(json.dumps(record, ensure_ascii=False) + "\n") | |
| except Exception: | |
| pass | |
| def load_manifest(out_dir: str) -> Dict[str, Dict[str, Any]]: | |
| data: Dict[str, Dict[str, Any]] = {} | |
| try: | |
| mpath = os.path.join(out_dir, MANIFEST_NAME) | |
| if not os.path.exists(mpath): | |
| return data | |
| with open(mpath, "r", encoding="utf-8") as f: | |
| for line in f: | |
| try: | |
| rec = json.loads(line) | |
| p = rec.get("path") | |
| sha = rec.get("sha256") | |
| if p: | |
| data[p] = rec | |
| if sha: | |
| data[sha] = rec | |
| except Exception: | |
| continue | |
| except Exception: | |
| pass | |
| return data | |
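| # Shape of one manifest.jsonl record written by append_manifest (values here are hypothetical): | |
| #   {"title": "...", "year": "2020", "venue": "arXiv", "source": "arxiv", | |
| #    "sha256": "3f2a9c...", "path": "/tmp/polymer_rag_pdfs/3f2a9c..._2011.00508.pdf"} | |
| # load_manifest indexes each record under both "path" and "sha256", so downloads can be deduplicated | |
| # by content hash and chunks can later be re-linked to their citation metadata. | |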
| # -------------------------------------------------------------------------------------- | |
| # DOWNLOAD SINGLE PDF | |
| # -------------------------------------------------------------------------------------- | |
| def download_pdf( | |
| url: str, | |
| out_dir: str, | |
| suggested_name: Optional[str] = None, | |
| timeout: int = 60, | |
| meta: Optional[Dict[str, Any]] = None, | |
| manifest: Optional[Dict[str, Dict[str, Any]]] = None, | |
| ) -> Optional[str]: | |
| """ | |
| Download a PDF and return local file path, or None on failure. | |
| Deduplicates by SHA256 content hash. | |
| Writes manifest record if meta provided. | |
| """ | |
| try: | |
| headers = {"User-Agent": f"polymer-rag/1.0 ({DEFAULT_MAILTO})"} | |
| with requests.get( | |
| url, headers=headers, timeout=timeout, stream=True, allow_redirects=True | |
| ) as r: | |
| r.raise_for_status() | |
| content_type = r.headers.get("Content-Type", "") | |
| raw = r.content | |
| if not raw or not is_probably_pdf(raw, content_type): | |
| return None | |
| sha = sha256_bytes(raw) | |
| ensure_dir(out_dir) | |
| # Check manifest for existing SHA | |
| if manifest and sha in manifest: | |
| existing_path = manifest[sha].get("path") | |
| if existing_path and os.path.exists(existing_path): | |
| return existing_path | |
| # Check filesystem for existing files with this hash | |
| existing = list(pathlib.Path(out_dir).glob(f"{sha[:16]}*.pdf")) | |
| if existing: | |
| path = str(existing[0]) | |
| if meta: | |
| rec = dict(meta) | |
| rec.update({"sha256": sha, "path": path}) | |
| append_manifest(out_dir, rec) | |
| return path | |
| base = suggested_name or pathlib.Path(url).name or "paper.pdf" | |
| base = safe_filename(base) | |
| if not base.lower().endswith(".pdf"): | |
| base += ".pdf" | |
| fname = f"{sha[:16]}_{base}" | |
| fpath = os.path.join(out_dir, fname) | |
| with open(fpath, "wb") as f: | |
| f.write(raw) | |
| if meta: | |
| rec = dict(meta) | |
| rec.update({"sha256": sha, "path": fpath}) | |
| append_manifest(out_dir, rec) | |
| return fpath | |
| except Exception: | |
| return None | |
| def retry(fn, args, retries=3, sleep=0.6, **kwargs): | |
| for i in range(retries): | |
| out = fn(*args, **kwargs) | |
| if out: | |
| return out | |
| time.sleep(sleep * (2**i)) | |
| return None | |
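| # retry() re-invokes fn with exponential backoff (0.6 s, 1.2 s, 2.4 s at the defaults) and returns | |
| # the first truthy result, so a transient network failure does not immediately drop a paper. | |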
| def download_one(entry: Union[str, Dict[str, Any]], out_dir: str, manifest: Dict): | |
| if isinstance(entry, dict): | |
| return download_pdf( | |
| entry["url"], | |
| out_dir, | |
| suggested_name=entry.get("name"), | |
| meta=entry.get("meta"), | |
| manifest=manifest, | |
| ) | |
| return download_pdf(entry, out_dir, manifest=manifest) | |
| def parallel_download_pdfs( | |
| entries: List[Union[str, Dict[str, Any]]], | |
| out_dir: str, | |
| manifest: Dict[str, Dict[str, Any]], | |
| max_workers: int = 12, | |
| desc: str = "Downloading PDFs", | |
| ) -> List[str]: | |
| ensure_dir(out_dir) | |
| results: List[str] = [] | |
| if not entries: | |
| return results | |
| with ThreadPoolExecutor(max_workers=max_workers) as ex: | |
| futs = [ex.submit(retry, download_one, (e, out_dir, manifest)) for e in entries] | |
| for f in tqdm(as_completed(futs), total=len(futs), desc=desc): | |
| p = f.result() | |
| if p: | |
| results.append(p) | |
| return results | |
| # -------------------------------------------------------------------------------------- | |
| # ARXIV | |
| # -------------------------------------------------------------------------------------- | |
| def arxiv_query_from_keywords(keywords: List[str]) -> str: | |
| kw = [k.replace(" ", "+") for k in keywords] | |
| terms = " OR ".join([f"ti:{k}" for k in kw] + [f"abs:{k}" for k in kw]) | |
| cats = ( | |
| "cat:cond-mat.mtrl-sci OR cat:cond-mat.soft OR cat:physics.chem-ph OR cat:cs.LG OR cat:stat.ML" | |
| ) | |
| return f"({terms}) AND ({cats})" | |
| def fetch_arxiv_pdf_urls(keywords: List[str], max_results: int = 800) -> List[str]: | |
| """ | |
| Extract explicit PDF links from the arXiv Atom feed, falling back to building them from <id> entries. | |
| """ | |
| query = arxiv_query_from_keywords(keywords) | |
| params = { | |
| "search_query": query, | |
| "start": 0, | |
| "max_results": max_results, | |
| "sortBy": "submittedDate", | |
| "sortOrder": "descending", | |
| } | |
| headers = {"User-Agent": f"polymer-rag/1.0 ({DEFAULT_MAILTO})"} | |
| try: | |
| resp = requests.get(ARXIV_SEARCH_URL, params=params, headers=headers, timeout=60) | |
| resp.raise_for_status() | |
| xml = resp.text | |
| except Exception: | |
| return [] | |
| pdfs: List[str] = [] | |
| seen = set() | |
| # explicit pdf hrefs | |
| for p in re.findall(r'href="(https?://arxiv\.org/pdf[^"]*)"', xml): | |
| if p not in seen: | |
| pdfs.append(p) | |
| seen.add(p) | |
| # fallback: build from id entries | |
| for aid in re.findall(r'<id>(https?://arxiv\.org/abs[^<]*)</id>', xml): | |
| m = re.search(r"arxiv\.org/abs/([^?v]+)", aid) | |
| if m: | |
| identifier = m.group(1) | |
| pdf = f"https://arxiv.org/pdf/{identifier}.pdf" | |
| if pdf not in seen: | |
| pdfs.append(pdf) | |
| seen.add(pdf) | |
| return pdfs | |
| def fetch_arxiv_pdfs( | |
| keywords: List[str], | |
| out_dir: str, | |
| manifest: Dict[str, Dict[str, Any]], | |
| max_results: int = 800, | |
| ) -> List[str]: | |
| urls = fetch_arxiv_pdf_urls(keywords, max_results=max_results) | |
| entries = [ | |
| { | |
| "url": u, | |
| "name": u.rstrip("/").split("/")[-1], | |
| "meta": {"source": "arxiv", "url": u}, | |
| } | |
| for u in urls | |
| ] | |
| paths = parallel_download_pdfs(entries, out_dir, manifest, max_workers=8, desc="arXiv PDFs") | |
| return paths | |
| # -------------------------------------------------------------------------------------- | |
| # OPENALEX | |
| # -------------------------------------------------------------------------------------- | |
| def openalex_fetch_works_try( | |
| search: str, | |
| filter_str: str, | |
| per_page: int, | |
| page: int, | |
| mailto: Optional[str], | |
| ) -> Dict[str, Any]: | |
| headers = {"User-Agent": f"polymer-rag/1.0 ({mailto or DEFAULT_MAILTO})"} | |
| params: Dict[str, Any] = { | |
| "search": search, | |
| "per-page": per_page, | |
| "per_page": per_page, | |
| "page": page, | |
| "sort": "publication_date:desc", | |
| } | |
| if filter_str: | |
| params["filter"] = filter_str | |
| if mailto: | |
| params["mailto"] = mailto | |
| resp = requests.get(OPENALEX_WORKS_URL, params=params, headers=headers, timeout=60) | |
| resp.raise_for_status() | |
| return resp.json() | |
| def openalex_fetch_works( | |
| keywords: List[str], | |
| max_results: int = 600, | |
| per_page: int = 200, | |
| mailto: Optional[str] = None, | |
| ) -> List[Dict[str, Any]]: | |
| """ | |
| Try multiple query forms with relaxed filters if needed. | |
| """ | |
| kws = sorted(set(keywords or []), key=str.lower) | |
| combined = " ".join(kws) | |
| or_query = " OR ".join(kws) | |
| attempts = [ | |
| {"q": combined, "filter": "is_oa:true,language:en"}, | |
| {"q": or_query, "filter": "is_oa:true,language:en"}, | |
| {"q": or_query, "filter": "is_oa:true"}, | |
| {"q": or_query, "filter": ""}, | |
| ] | |
| works: List[Dict[str, Any]] = [] | |
| for attempt in attempts: | |
| search = attempt["q"] | |
| filter_str = attempt["filter"] | |
| page = 1 | |
| while len(works) < max_results: | |
| try: | |
| data = openalex_fetch_works_try( | |
| search, filter_str, per_page, page, mailto or DEFAULT_MAILTO | |
| ) | |
| except Exception as e: | |
| print(f"[WARN] OpenAlex request failed: {e}") | |
| break | |
| results = data.get("results", []) | |
| if not results: | |
| break | |
| works.extend(results) | |
| if len(results) < per_page: | |
| break | |
| page += 1 | |
| time.sleep(0.12) | |
| if len(works) >= max_results: | |
| break | |
| if works: | |
| break | |
| return works[:max_results] | |
| def openalex_extract_pdf_entries( | |
| works: List[Dict[str, Any]], | |
| ) -> List[Dict[str, Any]]: | |
| """ | |
| Extract candidate PDF URLs and metadata from OpenAlex works. | |
| """ | |
| out: List[Dict[str, Any]] = [] | |
| seen_urls = set() | |
| for w in works: | |
| pdf = "" | |
| best = w.get("best_oa_location") or {} | |
| if isinstance(best, dict): | |
| pdf = best.get("pdf_url") or best.get("url_for_pdf") or best.get("url") or "" | |
| if not pdf: | |
| pl = w.get("primary_location") or {} | |
| if isinstance(pl, dict): | |
| pdf = ( | |
| pl.get("pdf_url") | |
| or pl.get("url_for_pdf") | |
| or pl.get("landing_page_url") | |
| or "" | |
| ) | |
| if not pdf: | |
| oa = w.get("open_access") or {} | |
| if isinstance(oa, dict): | |
| pdf = oa.get("oa_url") or oa.get("oa_url_for_pdf") or "" | |
| if not pdf or pdf in seen_urls: | |
| continue | |
| seen_urls.add(pdf) | |
| title = (w.get("title") or w.get("display_name") or "").strip() | |
| year = w.get("publication_year") or w.get("publication_date") or "" | |
| venue = "" | |
| pl = w.get("primary_location") or {} | |
| if isinstance(pl, dict): | |
| venue = (pl.get("source") or {}).get("display_name") or "" | |
| if not venue: | |
| venue = (w.get("host_venue") or {}).get("display_name") or "".strip() | |
| name = " - ".join([s for s in [title, venue, str(year) or ""] if s]) | |
| meta = {"title": title, "year": year, "venue": venue, "source": "openalex"} | |
| out.append({"url": pdf, "name": name, "meta": meta}) | |
| return out | |
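| # PDF URLs are taken in order of preference: best_oa_location, then primary_location, then the | |
| # work-level open_access record. Landing-page URLs are kept as a last resort; those that are not | |
| # real PDFs simply fail the %PDF check in download_pdf and are skipped. | |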
| def fetch_openalex_pdfs( | |
| keywords: List[str], | |
| out_dir: str, | |
| manifest: Dict[str, Dict[str, Any]], | |
| max_results: int = 600, | |
| mailto: Optional[str] = None, | |
| ) -> List[str]: | |
| works = openalex_fetch_works(keywords, max_results=max_results, mailto=mailto) | |
| if not works: | |
| print("[INFO] OpenAlex returned no works for given queries/filters.") | |
| return [] | |
| entries = openalex_extract_pdf_entries(works) | |
| if not entries: | |
| print("[INFO] OpenAlex works found, but no PDF links extracted.") | |
| return [] | |
| paths = parallel_download_pdfs( | |
| entries, out_dir, manifest, max_workers=16, desc="OpenAlex PDFs" | |
| ) | |
| return paths | |
| # -------------------------------------------------------------------------------------- | |
| # EUROPE PMC | |
| # -------------------------------------------------------------------------------------- | |
| def epmc_query_from_keywords(keywords: List[str]) -> str: | |
| return " OR ".join([f'"{k}"' for k in keywords]) | |
| def epmc_extract_pdf_entries_from_results( | |
| results: List[Dict[str, Any]], | |
| ) -> List[Dict[str, Any]]: | |
| out: List[Dict[str, Any]] = [] | |
| seen = set() | |
| for r in results: | |
| ftl = r.get("fullTextUrlList") or {} | |
| urls: List[str] = [] | |
| if isinstance(ftl, dict): | |
| for ful in ftl.get("fullTextUrl") or []: | |
| if isinstance(ful, dict): | |
| u = ful.get("url") or "" | |
| if u: | |
| urls.append(u) | |
| if not urls: | |
| fu = r.get("fullTextUrl") | |
| if isinstance(fu, str) and fu: | |
| urls.append(fu) | |
| for u in urls: | |
| if not u or u in seen: | |
| continue | |
| seen.add(u) | |
| title = r.get("title") or "".strip() | |
| year = r.get("firstPublicationDate") or r.get("pubYear") or "" | |
| name = " - ".join([s for s in [title, str(year) or ""] if s]) | |
| out.append( | |
| { | |
| "url": u, | |
| "name": name, | |
| "meta": {"title": title, "year": year, "source": "epmc"}, | |
| } | |
| ) | |
| return out | |
| def fetch_epmc_pdfs( | |
| keywords: List[str], | |
| out_dir: str, | |
| manifest: Dict[str, Dict[str, Any]], | |
| max_results: int = 200, | |
| page_size: int = 25, | |
| ) -> List[str]: | |
| """ | |
| Query Europe PMC and extract fullTextUrlList entries. | |
| """ | |
| q = epmc_query_from_keywords(keywords) | |
| params = { | |
| "query": q, | |
| "format": "json", | |
| "pageSize": page_size, | |
| "sort": "FIRST_PDATE desc", | |
| } | |
| headers = {"User-Agent": f"polymer-rag/1.0 ({DEFAULT_MAILTO})"} | |
| saved: List[str] = [] | |
| cursor = 1 | |
| total_fetched = 0 | |
| while total_fetched < max_results: | |
| params["page"] = cursor | |
| try: | |
| resp = requests.get(EPMC_SEARCH_URL, params=params, headers=headers, timeout=30) | |
| resp.raise_for_status() | |
| data = resp.json() | |
| except Exception as e: | |
| print(f"[WARN] Europe PMC request failed: {e}") | |
| break | |
| results = (data.get("resultList") or {}).get("result") or [] | |
| if not results: | |
| break | |
| entries = epmc_extract_pdf_entries_from_results(results) | |
| if not entries: | |
| cursor += 1 | |
| total_fetched += len(results) | |
| time.sleep(0.2) | |
| continue | |
| paths = parallel_download_pdfs(entries, out_dir, manifest, max_workers=8, desc="Europe PMC PDFs") | |
| saved.extend(paths) | |
| total_fetched += len(results) | |
| cursor += 1 | |
| time.sleep(0.2) | |
| return saved | |
| # -------------------------------------------------------------------------------------- | |
| # POLYMER JOURNALS OA | |
| # -------------------------------------------------------------------------------------- | |
| def fetch_polymer_journal_pdfs( | |
| journal_queries: List[Dict[str, Any]], | |
| out_dir: str, | |
| manifest: Dict[str, Dict[str, Any]], | |
| max_per_journal: int = 50, | |
| mailto: Optional[str] = None, | |
| ) -> List[str]: | |
| """ | |
| Fetch OA papers from specific polymer journals via OpenAlex. | |
| """ | |
| all_paths: List[str] = [] | |
| for jq in journal_queries: | |
| journal_name = jq["journal"] | |
| issn = jq.get("issn", "") | |
| publisher = jq.get("publisher", "") | |
| print(f"→ Fetching from {journal_name} ({publisher})...") | |
| # Build OpenAlex filter for this journal | |
| filter_parts = ["is_oa:true", "language:en"] | |
| if issn: | |
| filter_parts.append(f"primary_location.source.issn:{issn}") | |
| filter_str = ",".join(filter_parts) | |
| # Search for polymer-related content in this journal | |
| search_query = "polymer OR macromolecule OR copolymer" | |
| page = 1 | |
| journal_works = [] | |
| while len(journal_works) < max_per_journal: | |
| try: | |
| data = openalex_fetch_works_try( | |
| search_query, filter_str, 25, page, mailto or DEFAULT_MAILTO | |
| ) | |
| except Exception as e: | |
| print(f"[WARN] Failed to fetch {journal_name}: {e}") | |
| break | |
| results = data.get("results", []) | |
| if not results: | |
| break | |
| journal_works.extend(results) | |
| if len(results) < 25: | |
| break | |
| page += 1 | |
| time.sleep(0.15) | |
| if journal_works: | |
| entries = openalex_extract_pdf_entries(journal_works[:max_per_journal]) | |
| # Tag with journal source | |
| for e in entries: | |
| e["meta"]["journal"] = journal_name | |
| e["meta"]["publisher"] = publisher | |
| e["meta"]["source"] = f"{journal_name}_{publisher}".lower() | |
| paths = parallel_download_pdfs( | |
| entries, out_dir, manifest, max_workers=8, desc=f"{journal_name} PDFs" | |
| ) | |
| all_paths.extend(paths) | |
| print(f" → Downloaded {len(paths)} PDFs from {journal_name}") | |
| time.sleep(0.3) | |
| return all_paths | |
| # -------------------------------------------------------------------------------------- | |
| # WRAPPER FOR OPENAI EMBEDDINGS | |
| # -------------------------------------------------------------------------------------- | |
| class PolymerStyleOpenAIEmbeddings(OpenAIEmbeddings): | |
| """ | |
| OpenAI embeddings wrapper for polymer RAG. | |
| Default model: text-embedding-3-small (1536-dimensional vectors). | |
| """ | |
| def __init__(self, model: str = "text-embedding-3-small", **kwargs): | |
| super().__init__(model=model, **kwargs) | |
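| # Minimal usage sketch (assumes OPENAI_API_KEY is set in the environment): | |
| #   emb = PolymerStyleOpenAIEmbeddings() | |
| #   vec = emb.embed_query("PSMILES polymer electrolyte")  # list of 1536 floats | |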
| # -------------------------------------------------------------------------------------- | |
| # TOKENIZER FOR TRUE TOKEN-BASED SEGMENTATION | |
| # -------------------------------------------------------------------------------------- | |
| TOKENIZER = tiktoken.get_encoding("cl100k_base") | |
| def token_length(text: str) -> int: | |
| if not text: | |
| return 0 | |
| return len(TOKENIZER.encode(text)) | |
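| # token_length counts BPE tokens with the cl100k_base encoding, so the chunk sizes below are true | |
| # token windows rather than character counts (roughly 4 characters per token for English prose). | |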
| # -------------------------------------------------------------------------------------- | |
| # METADATA ENRICHMENT FROM MANIFEST | |
| # -------------------------------------------------------------------------------------- | |
| def attach_extra_metadata_from_manifest( | |
| docs: List[Any], manifest: Dict[str, Dict[str, Any]] | |
| ) -> None: | |
| """ | |
| Enrich Document metadata with manifest data for later citation. | |
| """ | |
| for d in docs: | |
| src_path = d.metadata.get("source", "") | |
| if not src_path: | |
| continue | |
| rec = manifest.get(src_path) | |
| if not rec: | |
| for k, v in manifest.items(): | |
| if os.path.basename(k) == os.path.basename(src_path): | |
| rec = v | |
| break | |
| if rec: | |
| for k in ["title", "year", "venue", "url", "source", "journal", "publisher"]: | |
| if k in rec: | |
| d.metadata[k] = rec[k] | |
| # -------------------------------------------------------------------------------------- | |
| # MULTI-SCALE CHUNKING | |
| # -------------------------------------------------------------------------------------- | |
| def multiscale_chunk_documents( | |
| docs: List[Any], min_chunk_tokens: int = 32 | |
| ) -> List[Any]: | |
| """ | |
| Multi-scale segmentation at TOKEN level: 512, 256, 128 token windows. | |
| """ | |
| splitter_specs = [ | |
| ("tokens=512", 512, 64), # 50% tokens overlap | |
| ("tokens=256", 256, 48), | |
| ("tokens=128", 128, 32), | |
| ] | |
| all_chunks: List[Any] = [] | |
| seg_id = 0 | |
| for scale_label, chunk_size, overlap in splitter_specs: | |
| splitter = RecursiveCharacterTextSplitter( | |
| chunk_size=chunk_size, | |
| chunk_overlap=overlap, | |
| length_function=token_length, | |
| separators=["\n\n", "\n", ". ", " ", ""], | |
| ) | |
| splits = splitter.split_documents(docs) | |
| for d in splits: | |
| if token_length(d.page_content or "") < min_chunk_tokens: | |
| continue | |
| d.metadata = dict(d.metadata or {}) | |
| d.metadata["segment_scale"] = scale_label | |
| d.metadata["segment_id"] = seg_id | |
| seg_id += 1 | |
| all_chunks.append(d) | |
| return all_chunks | |
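| # Each source page is indexed at three granularities (512-, 256- and 128-token windows); | |
| # "segment_scale" records which window a chunk came from and "segment_id" is a unique chunk | |
| # number across all scales, so coarse and fine views of the same passage coexist in the index. | |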
| # -------------------------------------------------------------------------------------- | |
| # BUILD RETRIEVER FROM LOCAL PDFs | |
| # -------------------------------------------------------------------------------------- | |
| def _split_and_build_retriever( | |
| documents_dir: str, | |
| persist_dir: Optional[str] = None, | |
| k: int = 10, | |
| embedding_model: str = "text-embedding-3-small", | |
| vector_backend: str = "chroma", | |
| min_chunk_tokens: int = 32, | |
| api_key: Optional[str] = None, | |
| ): | |
| """ | |
| Load PDFs, apply multi-scale chunking, and build a dense retriever. | |
| Uses text-embedding-3-small (1536-D) by default and deletes any existing Chroma | |
| database first so stored vectors cannot mismatch the embedding dimension. | |
| """ | |
| print(f"→ Loading PDFs from {documents_dir}...") | |
| try: | |
| loader = DirectoryLoader( | |
| documents_dir, | |
| glob="*.pdf", | |
| loader_cls=PyPDFLoader, | |
| show_progress=True, | |
| use_multithreading=True, | |
| silent_errors=True, | |
| ) | |
| except TypeError: | |
| loader = DirectoryLoader( | |
| documents_dir, | |
| glob="*.pdf", | |
| loader_cls=PyPDFLoader, | |
| show_progress=True, | |
| use_multithreading=True, | |
| ) | |
| docs = loader.load() | |
| if not docs: | |
| raise RuntimeError("No PDF documents found to index.") | |
| manifest = load_manifest(documents_dir) | |
| attach_extra_metadata_from_manifest(docs, manifest) | |
| documents = multiscale_chunk_documents(docs, min_chunk_tokens=min_chunk_tokens) | |
| print( | |
| f"→ Created {len(documents)} multi-scale segments from {len(docs)} PDFs (512/256/128-token windows)." | |
| ) | |
| print(f"→ Using OpenAI embeddings model: {embedding_model}") | |
| embeddings = PolymerStyleOpenAIEmbeddings(model=embedding_model, api_key=api_key) | |
| if vector_backend.lower() == "chroma": | |
| if persist_dir and os.path.exists(persist_dir): | |
| print(f"→ Deleting existing Chroma database at {persist_dir} to prevent dimension mismatch...") | |
| import shutil | |
| shutil.rmtree(persist_dir) | |
| print(f"→ Existing database deleted. Creating fresh database...") | |
| # Sanitize all text content to prevent Unicode errors | |
| for doc in documents: | |
| doc.page_content = sanitize_text(doc.page_content or "") | |
| for key, value in doc.metadata.items(): | |
| if isinstance(value, str): | |
| doc.metadata[key] = sanitize_text(value) | |
| # Process in batches to avoid rate limiting and memory issues | |
| batch_size = 500 # Adjust based on your document sizes (500 is safe for most cases) | |
| total_batches = (len(documents) + batch_size - 1) // batch_size | |
| print(f"→ Processing {len(documents)} documents in {total_batches} batches of {batch_size}...") | |
| vector_store = None | |
| for i in range(0, len(documents), batch_size): | |
| batch = documents[i : i + batch_size] | |
| batch_num = (i // batch_size) + 1 | |
| print(f" → Embedding batch {batch_num}/{total_batches} ({len(batch)} documents)...") | |
| if vector_store is None: | |
| # First batch: create the vector store | |
| if persist_dir: | |
| print(f" → Creating new Chroma database at {persist_dir}") | |
| vector_store = Chroma.from_documents( | |
| batch, embeddings, persist_directory=persist_dir | |
| ) | |
| else: | |
| # In-memory mode also needs batching | |
| vector_store = Chroma.from_documents(batch, embeddings) | |
| else: | |
| # Subsequent batches: add to existing store | |
| vector_store.add_documents(batch) | |
| time.sleep(0.5) # Small delay to avoid rate limiting | |
| elif vector_backend.lower() == "faiss": | |
| try: | |
| from langchain_community.vectorstores import FAISS | |
| except Exception as e: | |
| raise RuntimeError("FAISS requested but not available") from e | |
| # Sanitize all text content | |
| for doc in documents: | |
| doc.page_content = sanitize_text(doc.page_content or "") | |
| for key, value in doc.metadata.items(): | |
| if isinstance(value, str): | |
| doc.metadata[key] = sanitize_text(value) | |
| # FAISS also needs batching | |
| batch_size = 500 | |
| total_batches = (len(documents) + batch_size - 1) // batch_size | |
| print(f"→ Processing {len(documents)} documents in {total_batches} batches of {batch_size}...") | |
| vector_store = None | |
| for i in range(0, len(documents), batch_size): | |
| batch = documents[i : i + batch_size] | |
| batch_num = (i // batch_size) + 1 | |
| print(f" → Embedding batch {batch_num}/{total_batches} ({len(batch)} documents)...") | |
| if vector_store is None: | |
| vector_store = FAISS.from_documents(batch, embeddings) | |
| else: | |
| batch_store = FAISS.from_documents(batch, embeddings) | |
| vector_store.merge_from(batch_store) | |
| time.sleep(0.5) | |
| else: | |
| raise ValueError("vector_backend must be 'chroma' or 'faiss'") | |
| vector_retriever = vector_store.as_retriever(search_kwargs={"k": k}) | |
| print("→ RAG KB ready (dense retriever over multi-scale segments).") | |
| return vector_retriever | |
| # -------------------------------------------------------------------------------------- | |
| # PUBLIC API: BUILD RETRIEVER FROM WEB | |
| # -------------------------------------------------------------------------------------- | |
| def build_retriever_from_web( | |
| polymer_keywords: Optional[List[str]] = None, | |
| target_curated: int = TARGET_CURATED, | |
| target_journals: int = TARGET_JOURNALS, | |
| target_arxiv: int = TARGET_ARXIV, | |
| target_openalex: int = TARGET_OPENALEX, | |
| target_epmc: int = TARGET_EPMC, | |
| extra_pdf_urls: Optional[List[str]] = None, | |
| persist_dir: str = DEFAULT_PERSIST_DIR, | |
| tmp_download_dir: str = DEFAULT_TMP_DOWNLOAD_DIR, | |
| k: int = 10, | |
| embedding_model: str = "text-embedding-3-small", | |
| vector_backend: str = "chroma", | |
| mailto: Optional[str] = None, | |
| include_curated: bool = True, | |
| ): | |
| """ | |
| Fetch balanced polymer corpus across multiple sources. | |
| Target distribution (~2000 PDFs): | |
| - Curated guidelines/standards: 100 | |
| - Polymer journals OA: 200 | |
| - arXiv: 800 | |
| - OpenAlex: 600 | |
| - Europe PMC: 200 | |
| - Extra/databases: 100 | |
| """ | |
| polymer_keywords = sorted(set(polymer_keywords or POLYMER_KEYWORDS), key=str.lower) | |
| print("=" * 70) | |
| print("Fetching polymer PDFs from balanced sources...") | |
| print( | |
| f"Target: {target_curated} curated + {target_journals} journals + " | |
| f"{target_arxiv} arXiv + {target_openalex} OpenAlex + {target_epmc} EPMC" | |
| ) | |
| ensure_dir(tmp_download_dir) | |
| manifest = load_manifest(tmp_download_dir) | |
| source_stats = defaultdict(int) | |
| all_paths: List[str] = [] | |
| # -------------------------------------------------------------------------------------- | |
| # 1) Curated sources (IUPAC, ISO/ASTM, polymer informatics reviews) | |
| # -------------------------------------------------------------------------------------- | |
| if include_curated and CURATED_POLYMER_PDF_SOURCES: | |
| print(f"[1/6] Downloading {len(CURATED_POLYMER_PDF_SOURCES)} curated PDFs...") | |
| curated_paths = parallel_download_pdfs( | |
| CURATED_POLYMER_PDF_SOURCES[:target_curated], | |
| tmp_download_dir, | |
| manifest, | |
| max_workers=4, | |
| desc="Curated PDFs", | |
| ) | |
| for p in curated_paths: | |
| if p not in all_paths: | |
| all_paths.append(p) | |
| source_stats["curated"] += 1 | |
| print(f" → {len(curated_paths)} curated PDFs downloaded") | |
| # -------------------------------------------------------------------------------------- | |
| # 2) Polymer journals OA | |
| # -------------------------------------------------------------------------------------- | |
| try: | |
| print(f"[2/6] Fetching polymer journal PDFs (target: {target_journals})...") | |
| journal_paths = fetch_polymer_journal_pdfs( | |
| POLYMER_JOURNAL_QUERIES, | |
| tmp_download_dir, | |
| manifest, | |
| max_per_journal=target_journals // len(POLYMER_JOURNAL_QUERIES) + 1, | |
| mailto=mailto, | |
| ) | |
| for p in journal_paths: | |
| if p not in all_paths: | |
| all_paths.append(p) | |
| source_stats["journal"] += 1 | |
| print(f" → {len(journal_paths)} journal PDFs downloaded") | |
| except Exception as e: | |
| print(f"[WARN] Polymer journal fetch error: {e}") | |
| # -------------------------------------------------------------------------------------- | |
| # 3) arXiv polymer-focused categories | |
| # -------------------------------------------------------------------------------------- | |
| try: | |
| print(f"[3/6] Fetching arXiv PDFs (target: {target_arxiv})...") | |
| arxiv_paths = fetch_arxiv_pdfs( | |
| polymer_keywords, tmp_download_dir, manifest, max_results=target_arxiv | |
| ) | |
| for p in arxiv_paths: | |
| if p not in all_paths: | |
| all_paths.append(p) | |
| source_stats["arxiv"] += 1 | |
| print(f" → {len(arxiv_paths)} arXiv PDFs downloaded") | |
| except Exception as e: | |
| print(f"[WARN] arXiv fetch error: {e}") | |
| # -------------------------------------------------------------------------------------- | |
| # 4) OpenAlex broad polymer search | |
| # -------------------------------------------------------------------------------------- | |
| try: | |
| print(f"[4/6] Fetching OpenAlex PDFs (target: {target_openalex})...") | |
| openalex_paths = fetch_openalex_pdfs( | |
| polymer_keywords, | |
| tmp_download_dir, | |
| manifest, | |
| max_results=target_openalex, | |
| mailto=mailto, | |
| ) | |
| for p in openalex_paths: | |
| if p not in all_paths: | |
| all_paths.append(p) | |
| source_stats["openalex"] += 1 | |
| print(f" → {len(openalex_paths)} OpenAlex PDFs downloaded") | |
| except Exception as e: | |
| print(f"[WARN] OpenAlex fetch error: {e}") | |
| # -------------------------------------------------------------------------------------- | |
| # 5) Europe PMC biopolymers/materials | |
| # -------------------------------------------------------------------------------------- | |
| try: | |
| print(f"[5/6] Fetching Europe PMC PDFs (target: {target_epmc})...") | |
| epmc_paths = fetch_epmc_pdfs( | |
| polymer_keywords, tmp_download_dir, manifest, max_results=target_epmc | |
| ) | |
| for p in epmc_paths: | |
| if p not in all_paths: | |
| all_paths.append(p) | |
| source_stats["epmc"] += 1 | |
| print(f" → {len(epmc_paths)} Europe PMC PDFs downloaded") | |
| except Exception as e: | |
| print(f"[WARN] Europe PMC fetch error: {e}") | |
| # -------------------------------------------------------------------------------------- | |
| # 6) Extra URLs (user-provided, database exports, etc.) | |
| # -------------------------------------------------------------------------------------- | |
| if extra_pdf_urls: | |
| print(f"[6/6] Downloading {len(extra_pdf_urls)} extra PDFs...") | |
| extra_entries = [ | |
| {"url": u, "name": None, "meta": {"url": u, "source": "extra"}} | |
| for u in extra_pdf_urls | |
| ] | |
| extra_paths = parallel_download_pdfs( | |
| extra_entries, tmp_download_dir, manifest, max_workers=8, desc="Extra PDFs" | |
| ) | |
| for p in extra_paths: | |
| if p not in all_paths: | |
| all_paths.append(p) | |
| source_stats["extra"] += 1 | |
| print(f" → {len(extra_paths)} extra PDFs downloaded") | |
| # -------------------------------------------------------------------------------------- | |
| # Summary | |
| # -------------------------------------------------------------------------------------- | |
| total = len(all_paths) | |
| print("=" * 70) | |
| print("DOWNLOAD SUMMARY") | |
| print("=" * 70) | |
| print(f"Total unique PDFs downloaded: {total}") | |
| print(" by source:") | |
| for source, count in sorted(source_stats.items()): | |
| pct = (count / total * 100) if total > 0 else 0 | |
| print(f" {source:20s} {count:4d} PDFs ({pct:5.1f}%)") | |
| print("=" * 70) | |
| if total == 0: | |
| raise RuntimeError( | |
| "No PDFs fetched. Adjust keywords, targets, or add extra_pdf_urls." | |
| ) | |
| print("Building knowledge base from downloaded PDFs...") | |
| retriever = _split_and_build_retriever( | |
| documents_dir=tmp_download_dir, | |
| persist_dir=persist_dir, | |
| k=k, | |
| embedding_model=embedding_model, | |
| vector_backend=vector_backend, | |
| ) | |
| return retriever | |
| # -------------------------------------------------------------------------------------- | |
| # PUBLIC API: BUILD RETRIEVER FROM LOCAL PAPERS | |
| # -------------------------------------------------------------------------------------- | |
| def build_retriever( | |
| papers_path: str, | |
| persist_dir: Optional[str] = DEFAULT_PERSIST_DIR, | |
| k: int = 10, | |
| embedding_model: str = "text-embedding-3-small", | |
| vector_backend: str = "chroma", | |
| ): | |
| """ | |
| Build polymer RAG KB from local PDFs. | |
| """ | |
| print("Building RAG knowledge base from local PDFs...") | |
| return _split_and_build_retriever( | |
| documents_dir=papers_path, | |
| persist_dir=persist_dir, | |
| k=k, | |
| embedding_model=embedding_model, | |
| vector_backend=vector_backend, | |
| ) | |
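| # Minimal usage sketch for local PDFs (the directory path is hypothetical; requires OPENAI_API_KEY): | |
| #   retriever = build_retriever("./my_polymer_pdfs", persist_dir="chroma_polymer_db", k=5) | |
| #   docs = retriever.get_relevant_documents("glass transition temperature of polystyrene") | |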
| # -------------------------------------------------------------------------------------- | |
| # CONVENIENCE WRAPPER: POLYMER FOUNDATION MODELS | |
| # -------------------------------------------------------------------------------------- | |
| def build_retriever_polymer_foundation_models( | |
| persist_dir: str = DEFAULT_PERSIST_DIR, | |
| k: int = 10, | |
| vector_backend: str = "chroma", | |
| ): | |
| """ | |
| Convenience wrapper for polymer foundation model corpus. | |
| """ | |
| fm_kw = list( | |
| set(POLYMER_KEYWORDS) | |
| | { | |
| "BigSMILES", | |
| "PSMILES", | |
| "polymer SMILES", | |
| "polymer language model", | |
| "foundation model polymer", | |
| "masked language model polymer", | |
| "self-supervised polymer", | |
| "generative polymer", | |
| "polymer sequence modeling", | |
| "representation learning polymer", | |
| } | |
| ) | |
| return build_retriever_from_web( | |
| polymer_keywords=fm_kw, | |
| target_curated=100, | |
| target_journals=200, | |
| target_arxiv=800, | |
| target_openalex=600, | |
| target_epmc=200, | |
| persist_dir=persist_dir, | |
| k=k, | |
| embedding_model="text-embedding-3-small", | |
| vector_backend=vector_backend, | |
| ) | |
| # -------------------------------------------------------------------------------------- | |
| # MAIN | |
| # -------------------------------------------------------------------------------------- | |
| if __name__ == "__main__": | |
| retriever = build_retriever_from_web( | |
| polymer_keywords=POLYMER_KEYWORDS, | |
| target_curated=100, | |
| target_journals=200, | |
| target_arxiv=800, | |
| target_openalex=600, | |
| target_epmc=200, | |
| persist_dir="chroma_polymer_db_balanced", | |
| tmp_download_dir=DEFAULT_TMP_DOWNLOAD_DIR, | |
| k=10, | |
| embedding_model="text-embedding-3-small", | |
| vector_backend="chroma", | |
| mailto=DEFAULT_MAILTO, | |
| include_curated=True, | |
| ) | |
| print("\n" + "=" * 70) | |
| print("Testing retrieval with sample query") | |
| docs = retriever.get_relevant_documents("PSMILES polymer electrolyte design") | |
| for i, d in enumerate(docs, 1): | |
| meta = d.metadata or {} | |
| title = meta.get("title") or os.path.basename(meta.get("source", "")) or "document" | |
| year = meta.get("year", "") | |
| src = meta.get("source", "unknown") | |
| journal = meta.get("journal", "") | |
| scale = meta.get("segment_scale", "") | |
| source_str = f"{src}" | |
| if journal: | |
| source_str = f"{journal} ({src})" | |
| print(f"\n[{i}] {title}") | |
| print(f" Year: {year} | Source: {source_str} | Scale: {scale}") | |
| print(f" Content: {(d.page_content or '')[:200]}...") | |