sobinalosious92 commited on
Commit
3f4ebee
·
verified ·
1 Parent(s): a22718f

Upload 119 files

Browse files
This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
Files changed (50) hide show
  1. literature/__init__.py +100 -0
  2. literature/__pycache__/__init__.cpython-310.pyc +0 -0
  3. literature/__pycache__/__init__.cpython-313.pyc +0 -0
  4. literature/__pycache__/clarifier.cpython-310.pyc +0 -0
  5. literature/__pycache__/clarifier.cpython-313.pyc +0 -0
  6. literature/__pycache__/config.cpython-310.pyc +0 -0
  7. literature/__pycache__/config.cpython-313.pyc +0 -0
  8. literature/__pycache__/converters.cpython-310.pyc +0 -0
  9. literature/__pycache__/converters.cpython-313.pyc +0 -0
  10. literature/__pycache__/discovery.cpython-310.pyc +0 -0
  11. literature/__pycache__/discovery.cpython-313.pyc +0 -0
  12. literature/__pycache__/evaluation.cpython-310.pyc +0 -0
  13. literature/__pycache__/extraction.cpython-310.pyc +0 -0
  14. literature/__pycache__/extraction.cpython-313.pyc +0 -0
  15. literature/__pycache__/graph.cpython-313.pyc +0 -0
  16. literature/__pycache__/property_registry.cpython-310.pyc +0 -0
  17. literature/__pycache__/property_registry.cpython-313.pyc +0 -0
  18. literature/__pycache__/quality.cpython-310.pyc +0 -0
  19. literature/__pycache__/quality.cpython-313.pyc +0 -0
  20. literature/__pycache__/retrieval.cpython-310.pyc +0 -0
  21. literature/__pycache__/retrieval.cpython-313.pyc +0 -0
  22. literature/__pycache__/schemas.cpython-310.pyc +0 -0
  23. literature/__pycache__/schemas.cpython-313.pyc +0 -0
  24. literature/__pycache__/standardizer.cpython-310.pyc +0 -0
  25. literature/__pycache__/standardizer.cpython-313.pyc +0 -0
  26. literature/clarifier.py +89 -0
  27. literature/config.py +71 -0
  28. literature/converters.py +56 -0
  29. literature/discovery.py +380 -0
  30. literature/evaluation.py +155 -0
  31. literature/extraction.py +863 -0
  32. literature/graph.py +450 -0
  33. literature/property_registry.py +274 -0
  34. literature/quality.py +176 -0
  35. literature/retrieval.py +398 -0
  36. literature/schemas.py +329 -0
  37. literature/standardizer.py +211 -0
  38. scripts/__pycache__/run_literature_mining.cpython-313.pyc +0 -0
  39. scripts/evaluate_polyie.py +29 -0
  40. scripts/run_literature_mining.py +149 -0
  41. scripts/train_prior_slurm.sh +38 -0
  42. src/.DS_Store +0 -0
  43. src/__pycache__/conv.cpython-310.pyc +0 -0
  44. src/__pycache__/conv.cpython-313.pyc +0 -0
  45. src/__pycache__/data_builder.cpython-310.pyc +0 -0
  46. src/__pycache__/data_builder.cpython-313.pyc +0 -0
  47. src/__pycache__/discover_llm.cpython-310.pyc +0 -0
  48. src/__pycache__/discover_llm.cpython-313.pyc +0 -0
  49. src/__pycache__/discovery.cpython-310.pyc +0 -0
  50. src/__pycache__/discovery.cpython-313.pyc +0 -0
literature/__init__.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Literature mining package for project-based extraction workflows."""
2
+
3
+ from .schemas import (
4
+ ContextualizedValue,
5
+ DataQuality,
6
+ ExperimentalConditions,
7
+ ExtractionResult,
8
+ LiteratureEvidenceRecord,
9
+ LiteratureQuerySpec,
10
+ LiteratureSupportSummary,
11
+ PaperMetadata,
12
+ PaperCardResult,
13
+ PaperSource,
14
+ PolymerDataPoint,
15
+ QueryMode,
16
+ ReviewStatus,
17
+ )
18
+ from .property_registry import (
19
+ PROPERTY_CATALOG,
20
+ PLATFORM_PROPERTY_KEYS,
21
+ TEMPLATES,
22
+ TEMPLATE_LABELS,
23
+ build_extraction_prompt,
24
+ detect_property_keys,
25
+ normalize_property_key,
26
+ property_display_name,
27
+ )
28
+ from .quality import QualityAssessor, QualityReport
29
+ from .standardizer import StandardizationResult, UnitStandardizer, normalize_minus_signs
30
+ from .clarifier import ClarifierAgent, QueryAnalysis
31
+ from .evaluation import evaluate_predictions, load_json_records
32
+
33
+ try:
34
+ from .config import LiteratureConfig, get_config
35
+ except Exception: # pragma: no cover - optional runtime dependency
36
+ LiteratureConfig = None # type: ignore
37
+ get_config = None # type: ignore
38
+
39
+ try:
40
+ from .discovery import PaperDiscoveryAgent
41
+ except Exception: # pragma: no cover - optional runtime dependency
42
+ PaperDiscoveryAgent = None # type: ignore
43
+
44
+ try:
45
+ from .retrieval import PDFRetriever, extract_text_from_pdf
46
+ except Exception: # pragma: no cover - optional runtime dependency
47
+ PDFRetriever = None # type: ignore
48
+ extract_text_from_pdf = None # type: ignore
49
+
50
+ try:
51
+ from .extraction import ContextualizedExtractor, DataExtractor
52
+ except Exception: # pragma: no cover - optional runtime dependency
53
+ ContextualizedExtractor = None # type: ignore
54
+ DataExtractor = None # type: ignore
55
+
56
+ try:
57
+ from .converters import to_experiment_result
58
+ except Exception: # pragma: no cover - optional runtime dependency
59
+ to_experiment_result = None # type: ignore
60
+
61
+ __all__ = [
62
+ "LiteratureConfig",
63
+ "get_config",
64
+ "PaperMetadata",
65
+ "PaperSource",
66
+ "PolymerDataPoint",
67
+ "ExtractionResult",
68
+ "DataQuality",
69
+ "ContextualizedValue",
70
+ "ExperimentalConditions",
71
+ "LiteratureQuerySpec",
72
+ "PaperCardResult",
73
+ "LiteratureEvidenceRecord",
74
+ "LiteratureSupportSummary",
75
+ "QueryMode",
76
+ "ReviewStatus",
77
+ "PaperDiscoveryAgent",
78
+ "PDFRetriever",
79
+ "extract_text_from_pdf",
80
+ "DataExtractor",
81
+ "ContextualizedExtractor",
82
+ "QualityAssessor",
83
+ "QualityReport",
84
+ "UnitStandardizer",
85
+ "normalize_minus_signs",
86
+ "StandardizationResult",
87
+ "ClarifierAgent",
88
+ "QueryAnalysis",
89
+ "evaluate_predictions",
90
+ "load_json_records",
91
+ "to_experiment_result",
92
+ "PROPERTY_CATALOG",
93
+ "PLATFORM_PROPERTY_KEYS",
94
+ "TEMPLATES",
95
+ "TEMPLATE_LABELS",
96
+ "build_extraction_prompt",
97
+ "detect_property_keys",
98
+ "normalize_property_key",
99
+ "property_display_name",
100
+ ]
literature/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (2 kB). View file
 
literature/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (1.42 kB). View file
 
literature/__pycache__/clarifier.cpython-310.pyc ADDED
Binary file (2.53 kB). View file
 
literature/__pycache__/clarifier.cpython-313.pyc ADDED
Binary file (3.13 kB). View file
 
literature/__pycache__/config.cpython-310.pyc ADDED
Binary file (2.75 kB). View file
 
literature/__pycache__/config.cpython-313.pyc ADDED
Binary file (3.39 kB). View file
 
literature/__pycache__/converters.cpython-310.pyc ADDED
Binary file (1.76 kB). View file
 
literature/__pycache__/converters.cpython-313.pyc ADDED
Binary file (2.57 kB). View file
 
literature/__pycache__/discovery.cpython-310.pyc ADDED
Binary file (10.8 kB). View file
 
literature/__pycache__/discovery.cpython-313.pyc ADDED
Binary file (16.4 kB). View file
 
literature/__pycache__/evaluation.cpython-310.pyc ADDED
Binary file (5.28 kB). View file
 
literature/__pycache__/extraction.cpython-310.pyc ADDED
Binary file (22.6 kB). View file
 
literature/__pycache__/extraction.cpython-313.pyc ADDED
Binary file (30.2 kB). View file
 
literature/__pycache__/graph.cpython-313.pyc ADDED
Binary file (15.6 kB). View file
 
literature/__pycache__/property_registry.cpython-310.pyc ADDED
Binary file (8.36 kB). View file
 
literature/__pycache__/property_registry.cpython-313.pyc ADDED
Binary file (6.76 kB). View file
 
literature/__pycache__/quality.cpython-310.pyc ADDED
Binary file (5.71 kB). View file
 
literature/__pycache__/quality.cpython-313.pyc ADDED
Binary file (7.55 kB). View file
 
literature/__pycache__/retrieval.cpython-310.pyc ADDED
Binary file (9.92 kB). View file
 
literature/__pycache__/retrieval.cpython-313.pyc ADDED
Binary file (16 kB). View file
 
literature/__pycache__/schemas.cpython-310.pyc ADDED
Binary file (12.2 kB). View file
 
literature/__pycache__/schemas.cpython-313.pyc ADDED
Binary file (11.6 kB). View file
 
literature/__pycache__/standardizer.cpython-310.pyc ADDED
Binary file (6.41 kB). View file
 
literature/__pycache__/standardizer.cpython-313.pyc ADDED
Binary file (8.84 kB). View file
 
literature/clarifier.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Dict, List
5
+
6
+ from .property_registry import detect_property_keys, property_display_name
7
+
8
+
9
# Lower-cased substrings that indicate the query names a concrete polymer or
# material. Matched against the lower-cased query text in ClarifierAgent.analyze.
POLYMER_KEYWORDS = {
    "polymer",
    "polyimide",
    "peek",
    "polyethylene",
    "pedot",
    "pedot:pss",
    "p3ht",
    "smiles",
}

# Lower-cased substrings that indicate processing or measurement context
# (annealing, solvent choice, doping, deposition parameters, ...).
CONDITION_KEYWORDS = {
    "anneal",
    "annealing",
    "solvent",
    "dopant",
    "doping",
    "spin coat",
    "temperature",
    "thickness",
    "pressure",
    "humidity",
    "method",
}
33
+
34
+
35
@dataclass
class QueryAnalysis:
    """Structured outcome of a clarifier pass over a single user query."""

    original_query: str
    detected_polymers: List[str]
    detected_properties: List[str]
    detected_conditions: List[str]
    suggestions: List[str]
    clarification_required: bool
    status: str

    def to_payload(self) -> Dict[str, object]:
        """Return the analysis as a plain dict (field values are aliased, not copied)."""
        payload: Dict[str, object] = {}
        for attr in (
            "original_query",
            "detected_polymers",
            "detected_properties",
            "detected_conditions",
            "suggestions",
            "clarification_required",
            "status",
        ):
            payload[attr] = getattr(self, attr)
        return payload
55
+
56
+
57
class ClarifierAgent:
    """
    Lightweight clarifier for production search flows.
    It nudges users toward material + property + condition context without
    blocking valid free-form task queries.
    """

    def analyze(self, query: str) -> QueryAnalysis:
        """Scan *query* for polymer, property and condition hints and suggest gaps."""
        text = query or ""
        lowered = text.lower()

        # Keyword scans are substring matches against the lower-cased query.
        found_polymers: List[str] = []
        for keyword in POLYMER_KEYWORDS:
            if keyword in lowered:
                found_polymers.append(keyword)
        found_properties = detect_property_keys(text)
        found_conditions = [keyword for keyword in CONDITION_KEYWORDS if keyword in lowered]

        # One suggestion per missing dimension of context.
        hints: List[str] = []
        if not found_polymers:
            hints.append("Add a target polymer or material name.")
        if not found_properties:
            hints.append("Specify a key property focus, e.g. " + property_display_name("tg") + ".")
        if not found_conditions:
            hints.append("Add one processing or measurement condition if available.")

        # Clarification is only forced when neither a material nor a property
        # was recognized; conditions alone are optional context.
        needs_clarification = not (found_polymers or found_properties)

        return QueryAnalysis(
            original_query=query,
            detected_polymers=found_polymers,
            detected_properties=found_properties,
            detected_conditions=found_conditions,
            suggestions=hints,
            clarification_required=needs_clarification,
            status="pending_clarification" if needs_clarification else "ready",
        )
literature/config.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Configuration management for Literature Discovery module.
3
+ Uses pydantic-settings for environment variable loading.
4
+ """
5
+ from typing import Optional, List
6
+ from pydantic import Field
7
+ from pydantic_settings import BaseSettings
8
+ from functools import lru_cache
9
+
10
+
11
class LiteratureConfig(BaseSettings):
    """Literature mining configuration.

    Values are loaded by pydantic-settings from the environment (and a local
    ``.env`` file, see ``model_config``); each ``alias`` names the environment
    variable that feeds the field.
    """

    # API Keys
    pubmed_email: str = Field(default="scholar@university.edu", alias="PUBMED_EMAIL")
    pubmed_api_key: Optional[str] = Field(default=None, alias="PUBMED_API_KEY")
    semantic_scholar_api_key: Optional[str] = Field(default=None, alias="SEMANTIC_SCHOLAR_API_KEY")
    gemini_api_key: Optional[str] = Field(default=None, alias="GEMINI_API_KEY")
    # NOTE(review): intentionally sourced from MY_OPEN_WEBUI_API_KEY, not
    # OPENAI_API_KEY — confirm this mapping is still wanted.
    openai_api_key: Optional[str] = Field(default=None, alias="MY_OPEN_WEBUI_API_KEY")
    openai_base_url: Optional[str] = Field(default=None, alias="OPENAI_BASE_URL")
    pageindex_api_key: Optional[str] = Field(default=None, alias="PAGEINDEX_API_KEY")

    # LLM Configuration
    llm_model: str = Field(default="gemini/gemini-2.0-flash", alias="LLM_MODEL")
    embedding_model: str = Field(default="gemini/text-embedding-004")
    llm_temperature: float = Field(default=0.1, ge=0.0, le=1.0)
    llm_max_tokens: int = Field(default=4096)

    # Search Configuration
    default_search_limit: int = Field(default=20)
    pubmed_enabled: bool = Field(default=True)
    arxiv_enabled: bool = Field(default=True)
    semantic_scholar_enabled: bool = Field(default=True)  # Now enabled

    # Rate Limiting (Semantic Scholar: 1 req/sec)
    semantic_scholar_delay_s: float = Field(default=1.5)  # Slightly over 1s for safety
    pubmed_delay_s: float = Field(default=0.5)

    # Storage
    pdf_storage_dir: str = Field(default="data/literature/raw_pdfs")
    database_path: str = Field(default="data/literature/papers.db")

    # Processing
    max_concurrent_downloads: int = Field(default=3)
    extraction_timeout_s: int = Field(default=120)

    # PDF Download Headers (for avoiding 403)
    user_agent: str = Field(
        default="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    )

    # Target Polymers (for focused search)
    target_polymers: List[str] = Field(
        default=["PEDOT:PSS", "P3HT", "PBTTT", "P(NDI2OD-T2)", "PDPP-4T"]
    )

    # Extraction strategy: "paperqa" or "simple"
    extraction_strategy: str = Field(default="simple")
    # JSON-encoded list of model names offered in the UI; parsed downstream.
    literature_model_options: str = Field(default="[]", alias="LITERATURE_MODEL_OPTIONS")

    # pydantic-settings options: read ".env", ignore unknown variables.
    model_config = {
        "env_file": ".env",
        "env_file_encoding": "utf-8",
        "extra": "ignore",
    }
66
+
67
+
68
@lru_cache()
def get_config() -> LiteratureConfig:
    """Get configuration singleton."""
    # lru_cache on a zero-argument function yields a process-wide singleton:
    # the environment/.env is read only on the first call.
    return LiteratureConfig()
literature/converters.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Data model converters.
3
+
4
+ This module is now schema-optional:
5
+ - If legacy `src.utils.schema` exists, returns (Experiment, Result) objects.
6
+ - Otherwise returns two plain dict payloads for compatibility.
7
+ """
8
+ import time
9
+ from typing import Any, Dict, Tuple
10
+
11
+ from .schemas import PolymerDataPoint
12
+
13
+ try:
14
+ from src.utils.schema import Experiment, Result # type: ignore
15
+ HAS_LEGACY_SCHEMA = True
16
+ except Exception:
17
+ Experiment = None # type: ignore
18
+ Result = None # type: ignore
19
+ HAS_LEGACY_SCHEMA = False
20
+
21
+
22
def to_experiment_result(dp: PolymerDataPoint) -> Tuple[Any, Any]:
    """Convert a literature data point into an (experiment, result) pair.

    Returns legacy ``(Experiment, Result)`` objects when ``src.utils.schema``
    imported successfully (``HAS_LEGACY_SCHEMA``); otherwise two plain dicts
    with the same field layout.
    """
    # Millisecond timestamp keeps IDs unique across repeated conversions of
    # the same source paper.
    exp_id = f"lit_{dp.source_paper_id}_{int(time.time() * 1000)}"
    exp_payload: Dict[str, Any] = {
        "id": exp_id,
        "polymer_id": dp.polymer_name,
        # Missing numeric conditions are coerced to 0 / 0.0 rather than None.
        "concentration_mg_ml": dp.concentration_mg_ml or 0.0,
        "spin_speed_rpm": dp.spin_speed_rpm or 0,
        "annealing_temp_c": dp.annealing_temp_c or 0.0,
        "annealing_time_min": dp.annealing_time_min or 0.0,
        "status": "completed",
        # Provenance and secondary measurements travel in metadata.
        "metadata": {
            "dopant": dp.dopant,
            "dopant_ratio": dp.dopant_ratio,
            "solvent": dp.solvent,
            "source_paper_id": dp.source_paper_id,
            "source_table": dp.source_table_or_figure,
            "quality_tier": dp.quality_tier.value,
            "extraction_confidence": dp.extraction_confidence,
            "film_thickness_nm": dp.film_thickness_nm,
            "seebeck_coefficient_uv_k": dp.seebeck_coefficient_uv_k,
            "power_factor_uw_m_k2": dp.power_factor_uw_m_k2,
        },
    }
    res_payload: Dict[str, Any] = {
        "experiment_id": exp_id,
        "ec_s_cm": dp.electrical_conductivity_s_cm or 0.0,
        "tc_w_mk": dp.thermal_conductivity_w_mk,
        "xrd_crystallinity": dp.xrd_crystallinity_percent,
        "xrd_pi_stacking_angstrom": dp.xrd_pi_stacking_angstrom,
        "source": "literature",
    }

    if HAS_LEGACY_SCHEMA:
        return Experiment(**exp_payload), Result(**res_payload)  # type: ignore
    return exp_payload, res_payload
literature/discovery.py ADDED
@@ -0,0 +1,380 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Multi-source paper discovery module.
3
+ Implements PubMed, ArXiv, and Semantic Scholar search.
4
+ Uses synchronous code for MVP simplicity.
5
+ """
6
import logging
import re
import time
from typing import List, Optional

import arxiv
from Bio import Entrez

from .schemas import PaperMetadata, PaperSource
from .config import get_config
15
+
16
+ logger = logging.getLogger(__name__)
17
+ _SEMANTIC_SCHOLAR_IMPORT_MISSING_LOGGED = False
18
+
19
+
20
class ArxivSearcher:
    """ArXiv paper searcher."""

    def __init__(self) -> None:
        self.client = arxiv.Client()

    def search(self, query: str, limit: int = 10) -> List[PaperMetadata]:
        """
        Search ArXiv for papers.

        Args:
            query: Search query string
            limit: Maximum number of results

        Returns:
            List of PaperMetadata objects
        """
        logger.info(f"Searching ArXiv: '{query}' (limit={limit})")

        search = arxiv.Search(
            query=query,
            max_results=limit,
            sort_by=arxiv.SortCriterion.Relevance
        )

        papers: List[PaperMetadata] = []
        try:
            for result in self.client.results(search):
                # Strip only a *trailing* version suffix ("v1", "v2", ...).
                # The previous `.split('v')[0]` cut the ID at the first 'v'
                # anywhere in the string, which is only safe by accident for
                # modern numeric IDs; the anchored regex is robust for any ID.
                short_id = result.entry_id.split('/')[-1]
                arxiv_id = re.sub(r"v\d+$", "", short_id)

                paper = PaperMetadata(
                    id=f"arxiv_{arxiv_id}",
                    title=result.title,
                    authors=[a.name for a in result.authors],
                    year=result.published.year if result.published else None,
                    doi=result.doi,
                    abstract=result.summary,
                    venue="arXiv",
                    citation_count=None,
                    is_open_access=True,
                    source=PaperSource.ARXIV,
                    url=result.entry_id,
                    landing_url=result.entry_id,
                    pdf_url=result.pdf_url,
                )
                papers.append(paper)
        except Exception as e:
            # Best-effort: keep whatever was collected before the failure.
            logger.error(f"ArXiv search failed: {e}")

        logger.info(f"ArXiv returned {len(papers)} papers")
        return papers
72
+
73
+
74
class PubMedSearcher:
    """PubMed paper searcher using Biopython Entrez."""

    def __init__(self) -> None:
        # Entrez requires a contact e-mail; an API key (if configured)
        # is also registered globally on the Entrez module.
        config = get_config()
        Entrez.email = config.pubmed_email
        if config.pubmed_api_key:
            Entrez.api_key = config.pubmed_api_key
        # Seconds to wait between the esearch and efetch calls.
        self.delay = config.pubmed_delay_s

    def search(self, query: str, limit: int = 10) -> List[PaperMetadata]:
        """
        Search PubMed for papers.

        Args:
            query: Search query string
            limit: Maximum number of results

        Returns:
            List of PaperMetadata objects (empty list on any failure)
        """
        logger.info(f"Searching PubMed: '{query}' (limit={limit})")

        try:
            # Step 1: Search for IDs
            handle = Entrez.esearch(db="pubmed", term=query, retmax=limit)
            record = Entrez.read(handle)
            handle.close()

            id_list = record.get("IdList", [])
            if not id_list:
                logger.info("PubMed returned 0 papers")
                return []

            # Honor NCBI rate limits between the two E-utility calls.
            time.sleep(self.delay)

            # Step 2: Fetch details in XML format
            handle = Entrez.efetch(
                db="pubmed",
                id=id_list,
                rettype="xml",
                retmode="xml"
            )
            records = Entrez.read(handle)
            handle.close()

            papers: List[PaperMetadata] = []
            for article in records.get("PubmedArticle", []):
                try:
                    paper = self._parse_pubmed_article(article)
                    if paper:
                        papers.append(paper)
                except Exception as e:
                    # A single malformed article must not abort the batch.
                    logger.warning(f"Failed to parse PubMed article: {e}")

            logger.info(f"PubMed returned {len(papers)} papers")
            return papers

        except Exception as e:
            logger.error(f"PubMed search failed: {e}")
            return []

    def _parse_pubmed_article(self, article: dict) -> Optional[PaperMetadata]:
        """Parse a single PubMed article into PaperMetadata.

        Returns None when the article has no PMID; missing optional fields
        (year, DOI, venue) degrade to None / empty values.
        """
        medline = article.get("MedlineCitation", {})
        article_data = medline.get("Article", {})

        # Extract PMID
        pmid = str(medline.get("PMID", ""))
        if not pmid:
            return None

        # Extract title
        title = article_data.get("ArticleTitle", "Unknown Title")
        if isinstance(title, list):
            title = " ".join(str(t) for t in title)

        # Extract authors
        authors: List[str] = []
        author_list = article_data.get("AuthorList", [])
        for author in author_list:
            if isinstance(author, dict):
                last_name = author.get("LastName", "")
                fore_name = author.get("ForeName", "")
                # Collective/group authors lack LastName and are skipped.
                if last_name:
                    authors.append(f"{fore_name} {last_name}".strip())

        # Extract year
        year = None
        pub_date = article_data.get("Journal", {}).get("JournalIssue", {}).get("PubDate", {})
        if "Year" in pub_date:
            try:
                year = int(pub_date["Year"])
            except (ValueError, TypeError):
                pass

        # Extract abstract
        abstract = ""
        abstract_text = article_data.get("Abstract", {}).get("AbstractText", [])
        if isinstance(abstract_text, list):
            abstract = " ".join(str(t) for t in abstract_text)
        elif isinstance(abstract_text, str):
            abstract = abstract_text

        # Extract DOI
        doi = None
        id_list = article_data.get("ELocationID", [])
        for eid in id_list:
            # Entrez string elements carry their XML attributes; the DOI is
            # the ELocationID whose EIdType attribute is "doi".
            if hasattr(eid, "attributes") and eid.attributes.get("EIdType") == "doi":
                doi = str(eid)
                break

        journal = article_data.get("Journal", {})
        journal_title = journal.get("Title")

        return PaperMetadata(
            id=f"pubmed_{pmid}",
            title=str(title),
            authors=authors,
            year=year,
            doi=doi,
            abstract=abstract,
            venue=str(journal_title) if journal_title else None,
            citation_count=None,
            is_open_access=None,
            source=PaperSource.PUBMED,
            url=f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/",
            landing_url=f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/",
        )
203
+
204
+
205
class SemanticScholarSearcher:
    """Semantic Scholar paper searcher (with rate limiting)."""

    def __init__(self) -> None:
        config = get_config()
        self.api_key = config.semantic_scholar_api_key
        # Seconds slept before the request and between result items.
        self.delay = config.semantic_scholar_delay_s

    def search(self, query: str, limit: int = 10) -> List[PaperMetadata]:
        """
        Search Semantic Scholar for papers.
        Rate limited to avoid 403 errors.

        Args:
            query: Search query string
            limit: Maximum number of results

        Returns:
            List of PaperMetadata objects (empty list when the package is
            missing or the request fails)
        """
        logger.info(f"Searching Semantic Scholar: '{query}' (limit={limit})")

        # Lazy import to avoid dependency issues
        try:
            from semanticscholar import SemanticScholar
        except ImportError:
            # Log the missing optional dependency only once per process.
            global _SEMANTIC_SCHOLAR_IMPORT_MISSING_LOGGED
            if not _SEMANTIC_SCHOLAR_IMPORT_MISSING_LOGGED:
                logger.debug("semanticscholar package not installed; Semantic Scholar source disabled.")
                _SEMANTIC_SCHOLAR_IMPORT_MISSING_LOGGED = True
            return []

        time.sleep(self.delay)  # Initial delay

        try:
            client = SemanticScholar(api_key=self.api_key)
            results = client.search_paper(
                query,
                limit=limit,
                fields=['title', 'abstract', 'authors', 'year', 'externalIds', 'url', 'isOpenAccess', 'openAccessPdf', 'venue', 'citationCount']
            )

            papers: List[PaperMetadata] = []
            # The client may page beyond `limit`; cap the count ourselves.
            for item in results:
                if len(papers) >= limit:
                    break

                # Get PDF URL if available
                pdf_url = None
                if item.openAccessPdf and isinstance(item.openAccessPdf, dict):
                    pdf_url = item.openAccessPdf.get('url')

                paper = PaperMetadata(
                    id=f"s2_{item.paperId}",
                    title=item.title or "Unknown",
                    authors=[a.name for a in (item.authors or [])],
                    year=item.year,
                    doi=item.externalIds.get("DOI") if item.externalIds else None,
                    abstract=item.abstract,
                    venue=getattr(item, "venue", None),
                    citation_count=getattr(item, "citationCount", None),
                    is_open_access=bool(getattr(item, "isOpenAccess", False)),
                    source=PaperSource.SEMANTIC_SCHOLAR,
                    url=item.url,
                    landing_url=item.url,
                    pdf_url=pdf_url,
                )
                papers.append(paper)
                time.sleep(self.delay)  # Rate limit between items

            logger.info(f"Semantic Scholar returned {len(papers)} papers")
            return papers

        except Exception as e:
            logger.warning(f"Semantic Scholar search failed (likely 403): {e}")
            return []
281
+
282
+
283
class PaperDiscoveryAgent:
    """
    Paper discovery agent.
    Aggregates multiple search sources, deduplicates, and sorts results.
    """

    def __init__(self) -> None:
        config = get_config()
        self.searchers: List[tuple] = []

        # Instantiate only the sources enabled in configuration.
        source_factories = (
            ("arxiv", ArxivSearcher, config.arxiv_enabled),
            ("pubmed", PubMedSearcher, config.pubmed_enabled),
            ("semantic_scholar", SemanticScholarSearcher, config.semantic_scholar_enabled),
        )
        for name, factory, enabled in source_factories:
            if enabled:
                self.searchers.append((name, factory()))

        logger.info(f"Initialized PaperDiscoveryAgent with sources: {[s[0] for s in self.searchers]}")

    def discover(
        self,
        query: str,
        limit_per_source: int = 10,
        deduplicate: bool = True
    ) -> List[PaperMetadata]:
        """
        Search all sources and aggregate results.

        Args:
            query: Search query
            limit_per_source: Maximum results per source
            deduplicate: Whether to deduplicate by title

        Returns:
            Aggregated list of papers
        """
        collected: List[PaperMetadata] = []

        # Query each configured source; a failing source is logged and skipped.
        for source_name, searcher in self.searchers:
            try:
                found = searcher.search(query, limit_per_source)
            except Exception as e:
                logger.error(f"Search failed for {source_name}: {e}")
                continue
            collected.extend(found)
            logger.info(f"{source_name} returned {len(found)} papers")

        logger.info(f"Total papers before deduplication: {len(collected)}")

        if deduplicate:
            collected = self._deduplicate(collected)
            logger.info(f"Total papers after deduplication: {len(collected)}")

        return collected

    def _deduplicate(self, papers: List[PaperMetadata]) -> List[PaperMetadata]:
        """Deduplicate papers by normalized title, keeping first occurrences."""
        seen: set = set()
        kept: List[PaperMetadata] = []

        for paper in papers:
            # Normalization: lower-case and strip surrounding whitespace.
            key = paper.title.lower().strip()
            if key in seen:
                continue
            seen.add(key)
            kept.append(paper)

        return kept

    def build_thermoelectric_query(
        self,
        polymer: Optional[str] = None,
        include_tc: bool = True
    ) -> str:
        """
        Build a specialized thermoelectric search query.

        Args:
            polymer: Specific polymer name (e.g., "P3HT")
            include_tc: Whether to include thermal conductivity keywords

        Returns:
            Optimized search query string
        """
        terms: List[str] = []
        if polymer:
            terms.append(polymer)
        terms.extend([
            "organic thermoelectric",
            "conjugated polymer",
            "electrical conductivity",
        ])
        if include_tc:
            terms.append("thermal conductivity")

        query = " ".join(terms)
        logger.debug(f"Built query: {query}")
        return query
literature/evaluation.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Offline evaluation helpers for structured literature extraction.
3
+
4
+ The harness is intentionally dataset-agnostic so POLYIE-formatted exports and
5
+ internal regression sets can share the same metric implementation.
6
+ """
7
+ from __future__ import annotations
8
+
9
+ import json
10
+ from pathlib import Path
11
+ from typing import Any, Dict, Iterable, List, Sequence, Tuple
12
+
13
+ from .property_registry import normalize_property_key
14
+
15
+
16
+ CORE_FIELDS = ["material_name", "property_key", "raw_value", "raw_unit", "method"]
17
+
18
+
19
def load_json_records(path: str | Path) -> List[Dict[str, Any]]:
    """Load evaluation records from a ``.jsonl`` file or a JSON-array file.

    Raises:
        ValueError: if a non-``.jsonl`` file does not contain a top-level list.
    """
    fp = Path(path)
    text = fp.read_text(encoding="utf-8")

    if fp.suffix == ".jsonl":
        # One JSON object per non-blank line.
        records: List[Dict[str, Any]] = []
        for line in text.splitlines():
            if line.strip():
                records.append(json.loads(line))
        return records

    payload = json.loads(text)
    if isinstance(payload, list):
        return payload
    raise ValueError(f"Unsupported evaluation file format: {fp}")
27
+
28
+
29
def _coalesce(record: Dict[str, Any], *keys: str) -> str:
    """Return the first non-missing value among *keys* as a stripped string.

    A value is missing only when it is ``None`` or ``""``. Unlike the chained
    ``or`` expressions used previously, this keeps falsy-but-real values: a
    measured ``0`` / ``0.0`` no longer collapses to an empty string.
    """
    for key in keys:
        value = record.get(key)
        if value is not None and value != "":
            return str(value).strip()
    return ""


def normalize_record(record: Dict[str, Any]) -> Dict[str, Any]:
    """Normalize one extraction record onto the canonical evaluation schema.

    Accepts several field aliases (e.g. ``polymer_name`` / ``material`` for
    ``material_name``) and returns a dict with exactly the ``CORE_FIELDS``
    keys plus ``evidence_quote``, all stripped strings.
    """
    raw_property = _coalesce(record, "property_key", "property_name")
    return {
        "material_name": _coalesce(record, "material_name", "polymer_name", "material"),
        # Prefer the canonical registry key; fall back to the raw label.
        "property_key": normalize_property_key(raw_property) or raw_property,
        "raw_value": _coalesce(record, "raw_value", "value"),
        "raw_unit": _coalesce(record, "raw_unit", "unit"),
        "method": _coalesce(record, "method", "measurement_method"),
        "evidence_quote": _coalesce(record, "evidence_quote", "source_quote"),
    }
51
+
52
+
53
+ def _safe_div(numerator: float, denominator: float) -> float:
54
+ return numerator / denominator if denominator else 0.0
55
+
56
+
57
+ def _f1(precision: float, recall: float) -> float:
58
+ return (2 * precision * recall) / (precision + recall) if (precision + recall) else 0.0
59
+
60
+
61
def _field_pairs(records: Sequence[Dict[str, Any]], field: str) -> set[Tuple[str, str]]:
    """Collect lower-cased (material_name, *field* value) pairs.

    Records whose material name or field value is empty after normalization
    are skipped.
    """
    normalized = (normalize_record(raw) for raw in records)
    return {
        (rec.get("material_name", "").lower(), rec.get(field, "").lower())
        for rec in normalized
        if rec.get("material_name", "") and rec.get(field, "")
    }
70
+
71
+
72
def _relation_tuples(records: Sequence[Dict[str, Any]]) -> set[Tuple[str, str, str]]:
    """Lower-cased (material, property, value) triples; incomplete records skipped."""
    result: set[Tuple[str, str, str]] = set()
    for raw in records:
        rec = normalize_record(raw)
        material = rec["material_name"]
        prop = rec["property_key"]
        value = rec["raw_value"]
        if material and prop and value:
            result.add((material.lower(), prop.lower(), value.lower()))
    return result
85
+
86
+
87
def _record_tuples(records: Sequence[Dict[str, Any]]) -> set[Tuple[str, str, str, str, str]]:
    """One lower-cased tuple of all CORE_FIELDS per record (blank values included)."""
    return {
        tuple(rec[field].lower() for field in CORE_FIELDS)
        for rec in (normalize_record(raw) for raw in records)
    }
95
+
96
+
97
def evaluate_predictions(
    gold_records: Sequence[Dict[str, Any]],
    predicted_records: Sequence[Dict[str, Any]],
) -> Dict[str, Any]:
    """Score predicted extraction records against gold records.

    Returns a dict with:
      - per-field precision/recall/F1 over (material, field value) pairs,
      - relation-level metrics over (material, property, value) triples,
      - record-level metrics over full CORE_FIELDS tuples,
      - record completeness and evidence-grounding rates,
      - raw gold/predicted counts.
    """
    gold = [normalize_record(record) for record in gold_records]
    predicted = [normalize_record(record) for record in predicted_records]

    # Per-field precision/recall/F1 keyed on (material, value) pairs.
    field_metrics: Dict[str, Dict[str, float]] = {}
    for field in CORE_FIELDS:
        gold_pairs = _field_pairs(gold, field)
        predicted_pairs = _field_pairs(predicted, field)
        tp = len(gold_pairs & predicted_pairs)
        precision = _safe_div(tp, len(predicted_pairs))
        recall = _safe_div(tp, len(gold_pairs))
        field_metrics[field] = {
            "precision": precision,
            "recall": recall,
            "f1": _f1(precision, recall),
        }

    # Relation level: exact match on (material, property, value) triples.
    gold_rel = _relation_tuples(gold)
    pred_rel = _relation_tuples(predicted)
    rel_tp = len(gold_rel & pred_rel)
    rel_precision = _safe_div(rel_tp, len(pred_rel))
    rel_recall = _safe_div(rel_tp, len(gold_rel))

    # Record level: exact match on all CORE_FIELDS at once.
    gold_records_set = _record_tuples(gold)
    pred_records_set = _record_tuples(predicted)
    record_tp = len(gold_records_set & pred_records_set)
    record_precision = _safe_div(record_tp, len(pred_records_set))
    record_recall = _safe_div(record_tp, len(gold_records_set))

    # Fraction of CORE_FIELDS that are non-empty, averaged over predictions.
    filled_fields = [
        sum(1 for field in CORE_FIELDS if record.get(field))
        for record in predicted
    ]
    record_completeness = _safe_div(sum(filled_fields), len(predicted) * len(CORE_FIELDS))
    # Share of predictions carrying a supporting evidence quote.
    source_grounding_hit_rate = _safe_div(
        sum(1 for record in predicted if record.get("evidence_quote")),
        len(predicted),
    )

    return {
        "field_metrics": field_metrics,
        "relation_level": {
            "precision": rel_precision,
            "recall": rel_recall,
            "f1": _f1(rel_precision, rel_recall),
        },
        "record_level": {
            "precision": record_precision,
            "recall": record_recall,
            "f1": _f1(record_precision, record_recall),
        },
        "record_completeness": record_completeness,
        "source_grounding_hit_rate": source_grounding_hit_rate,
        "gold_count": len(gold),
        "predicted_count": len(predicted),
    }
literature/extraction.py ADDED
@@ -0,0 +1,863 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ LLM-based structured data extraction module.
3
+ Implements flexible interface: PageIndex (RAG via indexed PDFs) or Simple extraction (fallback).
4
+
5
+ Prompts are dynamically built from user-selected target properties via
6
+ ``literature.property_registry.build_extraction_prompt``.
7
+ """
8
+ import json
9
+ import re
10
+ import logging
11
+ import os
12
+ from typing import List, Optional, Any
13
+ from datetime import datetime
14
+
15
+ from .schemas import (
16
+ PaperMetadata,
17
+ PolymerDataPoint,
18
+ ExtractionResult,
19
+ DataQuality
20
+ )
21
+ from .config import get_config
22
+ from .retrieval import extract_text_from_pdf
23
+ from .property_registry import PROPERTY_CATALOG, build_extraction_prompt, TEMPLATES
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
# Default property set used when no explicit target properties are provided.
# The legacy thermoelectric-only template no longer exists in the production
# registry, so fall back to the platform-wide property core.
# (If the "platform_core" template is missing or empty, every key in
# PROPERTY_CATALOG is used instead.)
_DEFAULT_PROPERTIES = TEMPLATES.get("platform_core") or list(PROPERTY_CATALOG.keys())
31
+
32
# Sentinel error messages that mark an *expected* skip (missing or
# unconfigured backends) rather than a genuine extraction failure.
# Produced by DataExtractor/ContextualizedExtractor and recognized by
# is_expected_skip_error().
_SKIP_ERROR_MESSAGES = {
    "llm_unconfigured",
    "contextual_llm_unconfigured",
    "extraction_backend_unconfigured",
    "pageindex_requires_pdf_no_simple_backend",
    "pageindex_sdk_unavailable_no_simple_backend",
}
39
+
40
+
41
+ def _normalize_base_url(url: Optional[str]) -> Optional[str]:
42
+ text = str(url or "").strip().rstrip("/")
43
+ return text or None
44
+
45
+
46
def _is_http_url(url: Optional[str]) -> bool:
    """Return True when *url* normalizes to a non-empty http:// or https:// URL."""
    normalized = _normalize_base_url(url)
    if normalized is None:
        return False
    return normalized.startswith(("http://", "https://"))
49
+
50
+
51
def is_expected_skip_error(error_message: Optional[str]) -> bool:
    """Return True when *error_message* is one of the known skip sentinels."""
    normalized = str(error_message or "").strip()
    return normalized in _SKIP_ERROR_MESSAGES
53
+
54
+
55
# ============== JSON Safe Parsing (Fix Logic Bug #4 & #5) ==============

# Translation table mapping the Unicode dash/minus variants that OCR and PDF
# text extraction commonly emit onto the ASCII minus sign.
_MINUS_TRANSLATION = str.maketrans({
    '\u2212': '-',  # MINUS SIGN
    '\u2013': '-',  # EN DASH
    '\u2014': '-',  # EM DASH
    '\u2010': '-',  # HYPHEN
    '\u2011': '-',  # NON-BREAKING HYPHEN
    '\u2012': '-',  # FIGURE DASH
    '\u207b': '-',  # SUPERSCRIPT MINUS
    '\u208b': '-',  # SUBSCRIPT MINUS
})


def normalize_minus_signs(s: str) -> str:
    """
    Normalize all types of minus signs to ASCII minus.

    Fixes Logic Bug #5: OCR may produce Unicode minus (U+2212) instead of ASCII.

    Args:
        s: Arbitrary text, possibly containing Unicode dash variants.

    Returns:
        The text with every known dash/minus variant replaced by '-'.
    """
    # str.translate performs the whole substitution in one C-level pass
    # instead of one full string scan per chained str.replace call.
    return s.translate(_MINUS_TRANSLATION)


def safe_json_loads(text: str) -> Any:
    """
    Safely parse JSON, handling common LLM output issues.

    Fixes Logic Bug #4: LLM may return NaN, Infinity, Python-style None,
    trailing commas, or markdown code fences around the JSON payload.

    Args:
        text: Raw LLM output.

    Returns:
        The parsed JSON value, or None when *text* is empty.

    Raises:
        json.JSONDecodeError: When parsing fails and repair also fails.
        ImportError: When repair is needed but json_repair is not installed.
    """
    if not text:
        return None

    text = text.strip()

    # Extract JSON from markdown code blocks
    if "```json" in text:
        text = text.split("```json")[1].split("```")[0]
    elif "```" in text:
        parts = text.split("```")
        if len(parts) >= 2:
            text = parts[1]

    # Normalize minus signs before any numeric-token rewriting below.
    text = normalize_minus_signs(text)

    # Fix Python-style -> JSON-style
    text = re.sub(r'\bNone\b', 'null', text)
    text = re.sub(r'\bTrue\b', 'true', text)
    text = re.sub(r'\bFalse\b', 'false', text)

    # Remove trailing commas
    text = re.sub(r',\s*}', '}', text)
    text = re.sub(r',\s*]', ']', text)

    # Replace NaN / Infinity with null. The optional leading '-' must be part
    # of the pattern: substituting bare "Infinity" first would leave the
    # invalid token "-null" behind for "-Infinity" (and likewise for "-NaN").
    text = re.sub(r'-?\bNaN\b', 'null', text)
    text = re.sub(r'-?\bInfinity\b', 'null', text)

    try:
        return json.loads(text)
    except json.JSONDecodeError as e:
        logger.warning(f"Initial JSON parse failed: {e}")

        # Try json_repair if available
        try:
            from json_repair import repair_json
            repaired = repair_json(text)
            return json.loads(repaired)
        except ImportError:
            logger.warning("json_repair not installed, cannot repair JSON")
            raise
        except Exception as e2:
            logger.error(f"JSON repair also failed: {e2}")
            raise
130
+
131
+
132
+
133
+
134
# Extraction prompt template (static legacy version).
# NOTE(review): nothing in this module references EXTRACTION_PROMPT — the
# extraction paths build their prompts via build_extraction_prompt(); confirm
# there are no external importers before removing.
EXTRACTION_PROMPT = """
You are an expert in organic thermoelectrics and polymer science.
Your task is to extract ALL experimental data points from the provided paper.

## Target Data
Extract data for conjugated polymers used in thermoelectric applications, including:
- PEDOT:PSS, P3HT, PBTTT, P(NDI2OD-T2), PDPP series, etc.

## Required Fields (extract as many as available)
For EACH data point, extract:

### Material Information
- polymer_name: The polymer name/abbreviation (e.g., "P3HT", "PEDOT:PSS")
- dopant: Dopant used (e.g., "DMSO", "H2SO4", "FeCl3")
- dopant_ratio: Dopant concentration if specified (e.g., "5 wt%", "1 M")

### Processing Conditions
- solvent: Solvent used for film preparation
- concentration_mg_ml: Solution concentration in mg/mL
- spin_speed_rpm: Spin coating speed in RPM
- spin_time_s: Spin coating time in seconds
- annealing_temp_c: Annealing temperature in Celsius
- annealing_time_min: Annealing time in minutes
- annealing_atmosphere: Atmosphere during annealing (N2, Air, Vacuum)
- film_thickness_nm: Film thickness in nanometers

### Electrical Properties
- electrical_conductivity_s_cm: Electrical conductivity in S/cm
- seebeck_coefficient_uv_k: Seebeck coefficient in μV/K
- power_factor_uw_m_k2: Power factor in μW/(m·K²)

### Thermal Properties (IMPORTANT - often sparse)
- thermal_conductivity_w_mk: Thermal conductivity in W/(m·K)
- zt_figure_of_merit: ZT figure of merit (dimensionless)

### Structural Characterization
- xrd_crystallinity_percent: Crystallinity percentage from XRD
- xrd_pi_stacking_angstrom: π-π stacking distance in Angstrom
- xrd_lamellar_spacing_angstrom: Lamellar spacing in Angstrom

### Metadata
- source_table_or_figure: Where the data was found (e.g., "Table 1", "Figure 3")
- extraction_confidence: Your confidence in this extraction (0.0 to 1.0)

## CRITICAL Rules
1. Extract ONLY experimentally measured values, not theoretical predictions
2. Convert all units to the specified standard units
3. If a value range is given (e.g., "100-200 S/cm"), use the AVERAGE
4. If a value is "not measured" or "N/A", use null
5. Each row in a table = one data point
6. Include the source_table_or_figure for traceability

## Output Format
Return a valid JSON array. Example:
[
  {
    "polymer_name": "PEDOT:PSS",
    "dopant": "H2SO4",
    "dopant_ratio": "5 vol%",
    "electrical_conductivity_s_cm": 1200.5,
    "thermal_conductivity_w_mk": 0.35,
    "source_table_or_figure": "Table 2",
    "extraction_confidence": 0.9
  }
]

Return ONLY the JSON array, no markdown formatting, no explanations.
If no relevant data is found, return an empty array: []
"""
204
+
205
+
206
class DataExtractor:
    """
    Flexible data extractor with fallback strategy.

    Primary: PageIndex (RAG via indexed PDFs)
    Fallback: Simple extraction (pymupdf + direct LLM)
    """

    def __init__(
        self,
        strategy: Optional[str] = None,
        target_properties: Optional[List[str]] = None,
        extra_instructions: str = "",
    ) -> None:
        """Read backend credentials and settings from the global config.

        Args:
            strategy: Extraction strategy override; defaults to config.extraction_strategy.
            target_properties: Property keys to extract; defaults to _DEFAULT_PROPERTIES.
            extra_instructions: Free-form text forwarded to build_extraction_prompt().
        """
        config = get_config()
        self.strategy = strategy or config.extraction_strategy
        self.llm_model = config.llm_model
        self.gemini_key = config.gemini_api_key
        self.openai_key = config.openai_api_key
        # Normalized (no trailing slash) so endpoint paths concatenate cleanly.
        self.openai_base_url = _normalize_base_url(config.openai_base_url)
        self.pdf_dir = config.pdf_storage_dir
        self.pageindex_api_key = config.pageindex_api_key
        self.target_properties = target_properties or _DEFAULT_PROPERTIES
        self.extra_instructions = extra_instructions

        logger.info(f"Initialized DataExtractor with strategy: {self.strategy}, properties: {self.target_properties}")

    def has_openai_backend(self) -> bool:
        # Usable only when the base URL is a well-formed http(s) URL.
        return _is_http_url(self.openai_base_url)

    def has_any_llm_backend(self) -> bool:
        # Either an OpenAI-compatible endpoint or a Gemini API key suffices.
        return self.has_openai_backend() or bool(str(self.gemini_key or "").strip())

    def has_pageindex_backend(self) -> bool:
        return bool(str(self.pageindex_api_key or "").strip())

    def can_attempt_extraction(self) -> bool:
        return self.has_pageindex_backend() or self.has_any_llm_backend()

    def availability_reason(self) -> Optional[str]:
        # None means extraction can proceed; otherwise a human-readable skip reason.
        if self.can_attempt_extraction():
            return None
        return "Structured extraction skipped: configure PAGEINDEX_API_KEY or a valid LLM backend."

    def extract_from_papers(
        self,
        papers: List[PaperMetadata],
        use_full_text: bool = True
    ) -> List[ExtractionResult]:
        """
        Extract data from multiple papers.

        Args:
            papers: List of paper metadata (with pdf_path if available)
            use_full_text: Use PDF full text if available

        Returns:
            List of extraction results
        """
        results: List[ExtractionResult] = []

        for paper in papers:
            try:
                if self.strategy == "pageindex":
                    result = self._extract_with_pageindex(paper)
                else:
                    result = self._extract_simple(paper, use_full_text)
                results.append(result)
            except Exception as e:
                # A failure on one paper must not abort the whole batch;
                # record it as an unsuccessful ExtractionResult instead.
                logger.error(f"Extraction failed for {paper.id}: {e}")
                results.append(ExtractionResult(
                    paper=paper,
                    success=False,
                    error_message=str(e)
                ))

        return results

    def _extract_simple(
        self,
        paper: PaperMetadata,
        use_full_text: bool = True
    ) -> ExtractionResult:
        """
        Simple extraction: Extract PDF text -> Feed to LLM -> Parse JSON.
        Often more effective for metadata extraction.
        """
        logger.info(f"Simple extraction for: {paper.title[:50]}...")

        # Get content
        content = self._prepare_content(paper, use_full_text)
        if not content:
            return ExtractionResult(
                paper=paper,
                success=False,
                error_message="No content available"
            )

        # Use a sentinel error message so callers can recognize an
        # expected skip (see _SKIP_ERROR_MESSAGES / is_expected_skip_error).
        if not self.has_any_llm_backend():
            return ExtractionResult(
                paper=paper,
                success=False,
                error_message="llm_unconfigured",
                extraction_notes="Simple extraction skipped because no LLM backend is configured.",
            )

        # Call LLM with dynamic prompt
        dynamic_prompt = build_extraction_prompt(self.target_properties, self.extra_instructions)
        # The prompt template carries literal {title}/{content} placeholders;
        # str.replace (not str.format) avoids issues with braces in the paper text.
        prompt = dynamic_prompt.replace("{title}", paper.title or "Unknown").replace("{content}", content)

        try:
            raw_response = self._call_llm(prompt)

            if not raw_response:
                return ExtractionResult(
                    paper=paper,
                    success=False,
                    error_message="LLM returned empty response"
                )

            # Parse response
            data_points = self._parse_llm_output(raw_response, paper.id)

            # Assess quality for each point
            for dp in data_points:
                dp.quality_tier = self._assess_quality(dp)

            return ExtractionResult(
                paper=paper,
                data_points=data_points,
                llm_model_used=self.llm_model,
                extraction_timestamp=datetime.now(),
                success=True
            )

        except Exception as e:
            logger.error(f"Simple extraction failed: {e}")
            return ExtractionResult(
                paper=paper,
                success=False,
                error_message=str(e)
            )

    def _extract_with_pageindex(self, paper: PaperMetadata) -> ExtractionResult:
        """
        PageIndex extraction (RAG-enhanced via indexed PDF).
        Submits PDF to PageIndex, then uses chat_completions with extraction prompt.
        Falls back to simple extraction if PageIndex is unavailable or fails.
        """
        # Guard chain: each missing prerequisite falls back to simple
        # extraction when an LLM backend exists, otherwise returns a
        # sentinel skip error (see _SKIP_ERROR_MESSAGES).
        if not self.has_pageindex_backend():
            if self.has_any_llm_backend():
                return self._extract_simple(paper)
            return ExtractionResult(
                paper=paper,
                success=False,
                error_message="extraction_backend_unconfigured",
                extraction_notes="No PageIndex or LLM backend is configured.",
            )

        if not paper.pdf_path or not os.path.exists(paper.pdf_path):
            if self.has_any_llm_backend():
                return self._extract_simple(paper)
            return ExtractionResult(
                paper=paper,
                success=False,
                error_message="pageindex_requires_pdf_no_simple_backend",
                extraction_notes="PageIndex extraction requires a PDF when no simple LLM fallback is available.",
            )

        try:
            from src.literature_service.pageindex_client import PageIndexService
        except ImportError:
            if self.has_any_llm_backend():
                return self._extract_simple(paper)
            return ExtractionResult(
                paper=paper,
                success=False,
                error_message="pageindex_sdk_unavailable_no_simple_backend",
                extraction_notes="PageIndex SDK unavailable and no simple LLM fallback is configured.",
            )

        logger.info(f"PageIndex extraction for: {paper.title[:50]}...")

        try:
            service = PageIndexService(api_key=self.pageindex_api_key)

            # Submit the document to PageIndex
            doc_id = service.submit_document(paper.pdf_path)
            logger.info(f"Submitted to PageIndex, doc_id={doc_id}")

            # Wait for indexing to complete (poll status)
            import time
            for _ in range(30):  # max ~60 seconds
                status = service.get_document_status(doc_id)
                if status == "completed":
                    break
                if status in ("error", "failed"):
                    raise RuntimeError(f"PageIndex indexing failed with status: {status}")
                time.sleep(2)
            else:
                # for/else: the loop exhausted without ever seeing "completed".
                logger.warning("PageIndex indexing timed out, falling back to simple")
                return self._extract_simple(paper)

            # Use chat_completions with dynamic extraction prompt
            dynamic_prompt = build_extraction_prompt(self.target_properties, self.extra_instructions)
            # For PageIndex chat, we don't need the {title}/{content} placeholders
            # since the document is already indexed; strip those sections.
            pi_prompt = dynamic_prompt.split("**PAPER CONTENT:**")[0].strip()
            raw_answer = service.chat_completions(pi_prompt, doc_id)

            if not raw_answer:
                return ExtractionResult(
                    paper=paper,
                    success=False,
                    error_message="PageIndex returned empty response"
                )

            # Parse result
            data_points = self._parse_llm_output(raw_answer, paper.id)

            for dp in data_points:
                dp.quality_tier = self._assess_quality(dp)

            return ExtractionResult(
                paper=paper,
                data_points=data_points,
                llm_model_used="pageindex",
                extraction_timestamp=datetime.now(),
                success=True
            )

        except Exception as e:
            # Any PageIndex failure degrades gracefully to the simple path.
            logger.warning(f"PageIndex extraction failed, falling back to simple: {e}")
            return self._extract_simple(paper)

    def _prepare_content(
        self,
        paper: PaperMetadata,
        use_full_text: bool = True
    ) -> Optional[str]:
        """Prepare text content for extraction.

        Preference order: PDF full text (first 5 pages) > abstract > title only.
        Returns None when no usable text exists at all.
        """
        # Try PDF full text first
        if use_full_text and paper.pdf_path and os.path.exists(paper.pdf_path):
            full_text = extract_text_from_pdf(paper.pdf_path, max_pages=5)
            if full_text:
                return f"Title: {paper.title}\n\n{full_text}"

        # Fallback to abstract
        if paper.abstract:
            return f"Title: {paper.title}\n\nAbstract:\n{paper.abstract}"

        # Just title
        if paper.title:
            return f"Title: {paper.title}"

        return None

    def _call_llm(self, prompt: str) -> Optional[str]:
        """
        Call LLM (OpenAI-compatible first, then Gemini fallback).
        Prioritizes CRC OpenWebUI for reliability.

        Returns:
            The raw text response, or None when every backend is
            unavailable or fails.
        """
        # Try OpenAI-compatible (CRC) first
        if self.openai_key and self.openai_base_url:
            try:
                logger.info(f"Calling CRC OpenWebUI...")
                return self._call_openai_compatible(prompt)
            except Exception as e:
                logger.warning(f"CRC OpenWebUI call failed: {e}")

        # Fallback to Gemini
        if self.gemini_key:
            try:
                logger.info("Falling back to Gemini...")
                return self._call_gemini(prompt)
            except Exception as e:
                logger.warning(f"Gemini call failed: {e}")

        logger.debug("No LLM backend configured; skipping simple extraction call.")
        return None

    def _call_gemini(self, prompt: str) -> str:
        """Call Gemini API."""
        # Imported lazily so the module loads even without the Gemini SDK.
        import google.generativeai as genai

        genai.configure(api_key=self.gemini_key)
        model = genai.GenerativeModel("gemini-2.0-flash")

        response = model.generate_content(prompt)
        return response.text

    def _call_openai_compatible(self, prompt: str) -> str:
        """Call OpenAI-compatible API (CRC OpenWebUI)."""
        from openai import OpenAI

        client = OpenAI(
            api_key=self.openai_key,
            base_url=self.openai_base_url
        )

        # Use model from config (set in .env LLM_MODEL)
        model = self.llm_model
        # Handle litellm-style prefixes
        if model.startswith("gemini/"):
            model = "gpt-oss:latest"  # Fallback for CRC
        logger.info(f"Using model: {model}")

        # Low temperature keeps the structured-extraction output deterministic.
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.1
        )

        return response.choices[0].message.content

    def _parse_llm_output(
        self,
        raw_output: str,
        paper_id: str
    ) -> List[PolymerDataPoint]:
        """Parse LLM output into structured data points.

        Malformed JSON yields an empty list; individual malformed records
        are skipped with a warning rather than failing the whole batch.
        """
        try:
            # Use safe_json_loads for robust parsing
            raw_data = safe_json_loads(raw_output)
        except Exception as e:
            logger.error(f"JSON parsing failed for {paper_id}: {e}")
            return []

        if raw_data is None:
            logger.warning(f"No JSON data found in output for {paper_id}")
            return []

        # Ensure it's a list
        if not isinstance(raw_data, list):
            raw_data = [raw_data]

        # Convert to Pydantic models
        data_points: List[PolymerDataPoint] = []
        for item in raw_data:
            try:
                dp = PolymerDataPoint(
                    polymer_name=item.get("polymer_name", "Unknown"),
                    dopant=item.get("dopant"),
                    dopant_ratio=item.get("dopant_ratio"),
                    solvent=item.get("solvent"),
                    concentration_mg_ml=item.get("concentration_mg_ml"),
                    spin_speed_rpm=item.get("spin_speed_rpm"),
                    spin_time_s=item.get("spin_time_s"),
                    annealing_temp_c=item.get("annealing_temp_c"),
                    annealing_time_min=item.get("annealing_time_min"),
                    annealing_atmosphere=item.get("annealing_atmosphere"),
                    film_thickness_nm=item.get("film_thickness_nm"),
                    electrical_conductivity_s_cm=item.get("electrical_conductivity_s_cm"),
                    seebeck_coefficient_uv_k=item.get("seebeck_coefficient_uv_k"),
                    power_factor_uw_m_k2=item.get("power_factor_uw_m_k2"),
                    thermal_conductivity_w_mk=item.get("thermal_conductivity_w_mk"),
                    zt_figure_of_merit=item.get("zt_figure_of_merit"),
                    xrd_crystallinity_percent=item.get("xrd_crystallinity_percent"),
                    xrd_pi_stacking_angstrom=item.get("xrd_pi_stacking_angstrom"),
                    xrd_lamellar_spacing_angstrom=item.get("xrd_lamellar_spacing_angstrom"),
                    source_paper_id=paper_id,
                    source_table_or_figure=item.get("source_table_or_figure"),
                    extraction_confidence=item.get("extraction_confidence", 0.5),
                )
                data_points.append(dp)
            except Exception as e:
                logger.warning(f"Failed to parse data point: {e}")

        logger.info(f"Extracted {len(data_points)} data points from {paper_id}")
        return data_points

    def _assess_quality(self, dp: PolymerDataPoint) -> DataQuality:
        """Assess data point quality tier.

        GOLD: electrical + thermal conductivity + XRD + processing present;
        SILVER: electrical conductivity plus XRD or processing;
        BRONZE: everything else.
        """
        has_ec = dp.electrical_conductivity_s_cm is not None
        has_tc = dp.thermal_conductivity_w_mk is not None
        has_xrd = (dp.xrd_crystallinity_percent is not None or
                   dp.xrd_pi_stacking_angstrom is not None)
        has_process = (dp.annealing_temp_c is not None and
                       dp.spin_speed_rpm is not None)

        if has_ec and has_tc and has_xrd and has_process:
            return DataQuality.GOLD
        elif has_ec and (has_xrd or has_process):
            return DataQuality.SILVER
        else:
            return DataQuality.BRONZE
592
+
593
+
594
# ============== NEW: Contextualized Extraction ==============

# NOTE(review): CONTEXTUALIZED_EXTRACTION_PROMPT is not referenced anywhere
# in this module (ContextualizedExtractor builds its prompt via
# build_extraction_prompt); confirm external callers before removing.
# The doubled braces ({{ }}) appear intended to keep the JSON example
# literal under str.format-style substitution of {title}/{content} — verify.
CONTEXTUALIZED_EXTRACTION_PROMPT = """
You are an expert in organic thermoelectrics and polymer science.
Extract ALL experimental data points from the provided paper.

## CRITICAL REQUIREMENTS

1. **Extract ALL values, not just the best one**
   - A paper may report multiple values under different conditions
   - Extract EACH value as a separate data point

2. **Include COMPLETE experimental conditions**
   - Every value must have its associated conditions
   - Common: temperature, annealing, doping level, measurement method

3. **MANDATORY: Include source quote**
   - For EACH data point, include the exact sentence from the paper
   - Quote must be >10 characters and reference the value

## TARGET PROPERTIES

- `electrical_conductivity` (S/cm, S/m)
- `thermal_conductivity` (W/mK)
- `seebeck_coefficient` (μV/K)
- `power_factor` (μW/mK²)
- `zt_figure_of_merit` (dimensionless)

## OUTPUT FORMAT (JSON Array)

Return ONLY valid JSON, no markdown, no explanation:

[
  {{
    "polymer_name": "PEDOT:PSS",
    "dopant": "H2SO4",
    "dopant_ratio": "5 vol%",
    "property_name": "electrical_conductivity",
    "raw_value": "4380",
    "raw_unit": "S/cm",
    "conditions": {{
      "solvent": "water",
      "annealing_temp_c": 150,
      "annealing_time_min": 10,
      "measurement_temp_k": 300,
      "measurement_method": "4-point probe"
    }},
    "source_quote": "The electrical conductivity reached 4380 S/cm after H2SO4 treatment.",
    "source_location": "Table 2, Sample S5",
    "extraction_confidence": 0.95
  }}
]

## RULES

1. If values range "from X to Y", extract BOTH as separate points
2. Preserve scientific notation as "5.2e3" or actual number
3. If no source quote found, set extraction_confidence < 0.5
4. Return ONLY valid JSON array, no other text

---

**PAPER CONTENT:**

Title: {title}

{content}

---

JSON output:
"""
666
+
667
+
668
class ContextualizedExtractor:
    """
    Contextualized data extractor.

    Produces ContextualizedValue objects with mandatory source quotes for traceability.
    """

    def __init__(
        self,
        model_id: Optional[str] = None,
        target_properties: Optional[List[str]] = None,
        extra_instructions: str = "",
    ):
        """
        Initialize extractor.

        Args:
            model_id: LLM model ID to use (default from config)
            target_properties: List of property keys to extract
            extra_instructions: Free-form LLM instructions
        """
        config = get_config()
        self.model_id = model_id or config.llm_model
        self.openai_base_url = _normalize_base_url(config.openai_base_url)
        self.openai_key = config.openai_api_key
        self.target_properties = target_properties or _DEFAULT_PROPERTIES
        self.extra_instructions = extra_instructions

    def is_configured(self) -> bool:
        # Requires a valid http(s) OpenAI-compatible base URL; unlike
        # DataExtractor, there is no Gemini fallback here.
        return _is_http_url(self.openai_base_url)

    def extract_from_paper(
        self,
        paper: PaperMetadata,
        use_full_text: bool = True
    ) -> "ExtractionResult":
        """
        Extract contextualized data from a paper.

        Args:
            paper: Paper metadata
            use_full_text: Use PDF full text if available

        Returns:
            ExtractionResult with ContextualizedValue data points
        """
        # NOTE(review): this local import shadows the module-level
        # ExtractionResult; the objects built below use paper_id/paper_title
        # fields, unlike DataExtractor's paper= variant — presumably a
        # different schema class; confirm in literature.schemas.
        from .schemas import ContextualizedValue, ExperimentalConditions, ExtractionResult

        logger.info(f"Contextualized extraction for: {paper.title[:50]}...")

        if not self.is_configured():
            return ExtractionResult(
                paper_id=paper.id,
                paper_title=paper.title,
                success=False,
                error_message="contextual_llm_unconfigured",
                extraction_notes="Contextualized extraction skipped because no OpenAI-compatible base URL is configured.",
            )

        # Prepare content
        content = paper.full_text if use_full_text and paper.full_text else paper.abstract
        if not content:
            return ExtractionResult(
                paper_id=paper.id,
                paper_title=paper.title,
                success=False,
                error_message="No content available"
            )

        # Truncate content to fit context window
        content = content[:15000]

        # Build dynamic prompt from target properties
        prompt_template = build_extraction_prompt(self.target_properties, self.extra_instructions)
        # str.replace (not str.format) avoids issues with braces in the paper text.
        prompt = prompt_template.replace("{title}", paper.title or "Unknown").replace("{content}", content)

        try:
            # Call LLM
            raw_response = self._call_llm(prompt)

            if not raw_response:
                # is_configured() already passed above, so this normally
                # reports an empty LLM response.
                return ExtractionResult(
                    paper_id=paper.id,
                    paper_title=paper.title,
                    success=False,
                    error_message="contextual_llm_unconfigured" if not self.is_configured() else "LLM returned empty response"
                )

            # Parse response
            data_points = self._parse_response(raw_response, paper.id)

            return ExtractionResult(
                paper_id=paper.id,
                paper_title=paper.title,
                data_points=data_points,
                extraction_model=self.model_id,
                success=True
            )

        except Exception as e:
            logger.warning(f"Contextualized extraction failed for {paper.id}: {e}")
            return ExtractionResult(
                paper_id=paper.id,
                paper_title=paper.title,
                success=False,
                error_message=str(e)
            )

    def _call_llm(self, prompt: str) -> Optional[str]:
        """Call LLM via OpenAI-compatible API.

        Returns the raw response text, or None when no base URL is configured.
        Raises httpx.HTTPStatusError on non-2xx responses.
        """
        import httpx

        if not self.is_configured():
            logger.debug("Contextualized extractor skipped: OpenAI-compatible base URL is not configured.")
            return None

        logger.info("Calling LLM for contextualized extraction...")
        logger.info(f"Using model: {self.model_id}")

        headers = {
            "Content-Type": "application/json",
        }
        # The Authorization header is optional: some local endpoints need no key.
        if self.openai_key:
            headers["Authorization"] = f"Bearer {self.openai_key}"

        payload = {
            "model": self.model_id,
            "messages": [{"role": "user", "content": prompt}],
            "temperature": 0.2,
            "max_tokens": 3000,
        }

        with httpx.Client(timeout=120) as client:
            response = client.post(
                f"{self.openai_base_url}/chat/completions",
                json=payload,
                headers=headers
            )
            response.raise_for_status()
            data = response.json()

        return data["choices"][0]["message"]["content"]

    def _parse_response(self, response: str, paper_id: str) -> List:
        """Parse LLM response into ContextualizedValue objects.

        Malformed JSON yields an empty list; individual bad records are
        skipped with a warning.
        """
        from .schemas import ContextualizedValue, ExperimentalConditions

        try:
            data = safe_json_loads(response)
        except Exception as e:
            logger.warning(f"JSON parse failed for {paper_id}: {e}")
            return []

        if data is None:
            return []

        if not isinstance(data, list):
            data = [data]

        results = []
        for item in data:
            if not isinstance(item, dict):
                continue

            try:
                # Handle conditions: pop so the remaining keys can be
                # splatted into ContextualizedValue below.
                conditions_data = item.pop("conditions", {})
                conditions = ExperimentalConditions(**conditions_data) if conditions_data else ExperimentalConditions()

                # Ensure required fields: source_quote is mandatory on the
                # schema, so synthesize a placeholder when the LLM omits it.
                if "source_quote" not in item or not item.get("source_quote"):
                    item["source_quote"] = f"[Extracted from {paper_id}]"

                value = ContextualizedValue(
                    conditions=conditions,
                    **item
                )
                results.append(value)
            except Exception as e:
                logger.warning(f"Failed to parse data point: {e}")
                continue

        logger.info(f"Extracted {len(results)} contextualized data points from {paper_id}")
        return results

    def extract_from_papers(
        self,
        papers: List[PaperMetadata],
        use_full_text: bool = True
    ) -> List:
        """Batch extraction from multiple papers.

        Per-paper failures are already absorbed inside extract_from_paper,
        so this loop never aborts early.
        """
        results = []
        for paper in papers:
            result = self.extract_from_paper(paper, use_full_text)
            results.append(result)
        return results
literature/graph.py ADDED
@@ -0,0 +1,450 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ LangGraph workflow for Literature Discovery System.
3
+ Implements: discover → download → extract → quality pipeline.
4
+
5
+ Key design principles:
6
+ 1. All state modifications must be explicit in return values
7
+ 2. No in-place object modification
8
+ 3. Each node returns logs for UI feedback
9
+ """
10
+ import logging
11
+ from typing import TypedDict, List, Optional, Annotated, Literal, Callable, Any
12
+ from datetime import datetime
13
+ import operator
14
+
15
+ from langgraph.graph import StateGraph, END, START
16
+ from langgraph.checkpoint.memory import MemorySaver
17
+
18
+ from .schemas import PaperMetadata, PolymerDataPoint, ExtractionResult, DataQuality
19
+ from .discovery import PaperDiscoveryAgent
20
+ from .extraction import DataExtractor
21
+ from .quality import QualityAssessor
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+
26
+ # ============== State Definition ==============
27
+
28
class LogEntry(TypedDict):
    """Log entry for UI feedback"""
    timestamp: str  # ISO-8601 timestamp (datetime.now().isoformat())
    node: str       # workflow node that emitted the entry
    message: str    # human-readable message shown in the UI
    level: str  # info, warning, error
34
+
35
+
36
class LiteratureState(TypedDict):
    """
    Workflow state.

    Important: LangGraph state updates are based on return values.
    If you modify a field, you MUST include it in the return dict.
    """
    # Input
    search_query: str
    max_papers: int
    use_full_text: bool

    # Progress tracking
    current_node: str
    progress_percent: int

    # Intermediate results
    papers: List[Any]  # List[PaperMetadata] serialized
    downloaded_pdfs: List[str]
    extraction_results: List[Any]  # List[ExtractionResult] serialized

    # Final output
    verified_data: List[Any]  # List[PolymerDataPoint] serialized
    quality_report: Optional[dict]

    # Logging & Status
    # operator.add reducer: each node's returned logs are appended, never replaced.
    logs: Annotated[List[LogEntry], operator.add]
    status: Literal["running", "completed", "failed", "cancelled"]
    error: Optional[str]
65
+
66
+
67
def create_initial_state(
    query: str,
    max_papers: int = 10,
    use_full_text: bool = False
) -> LiteratureState:
    """Build the starting state for a new workflow run."""
    state: LiteratureState = dict(
        search_query=query,
        max_papers=max_papers,
        use_full_text=use_full_text,
        current_node="start",
        progress_percent=0,
        papers=[],
        downloaded_pdfs=[],
        extraction_results=[],
        verified_data=[],
        quality_report=None,
        logs=[],
        status="running",
        error=None,
    )
    return state
88
+
89
+
90
+ # ============== Helper Functions ==============
91
+
92
+ def _log(node: str, message: str, level: str = "info") -> LogEntry:
93
+ """Create log entry"""
94
+ return {
95
+ "timestamp": datetime.now().isoformat(),
96
+ "node": node,
97
+ "message": message,
98
+ "level": level,
99
+ }
100
+
101
+
102
def _serialize_paper(paper: PaperMetadata) -> dict:
    """Serialize a PaperMetadata model to a plain dict for state storage."""
    return paper.model_dump()
105
+
106
+
107
def _deserialize_paper(data: dict) -> PaperMetadata:
    """Rebuild a PaperMetadata model from its serialized dict form."""
    return PaperMetadata(**data)
110
+
111
+
112
+ # ============== Node Functions ==============
113
+
114
def discover_node(state: LiteratureState) -> dict:
    """
    Paper discovery node.
    Uses existing PaperDiscoveryAgent (synchronous).
    """
    node_name = "discover"
    log_entries = [_log(node_name, f"Searching for: '{state['search_query']}'")]

    try:
        found = PaperDiscoveryAgent().discover(
            query=state["search_query"],
            limit_per_source=state["max_papers"],
        )
        log_entries.append(_log(node_name, f"Found {len(found)} unique papers"))

        return {
            # Papers are stored serialized so the state stays plain-dict.
            "papers": [_serialize_paper(p) for p in found],
            "current_node": node_name,
            "progress_percent": 25,
            "logs": log_entries,
        }
    except Exception as exc:
        logger.exception(f"Discover node failed: {exc}")
        log_entries.append(_log(node_name, f"Error: {exc}", "error"))
        return {
            "papers": [],
            "current_node": node_name,
            "status": "failed",
            "error": str(exc),
            "logs": log_entries,
        }
150
+
151
+
152
def download_node(state: LiteratureState) -> dict:
    """
    PDF download node.
    Uses existing PDFRetriever (synchronous).
    """
    from .retrieval import PDFRetriever

    node_name = "download"
    paper_dicts = state["papers"]
    log_entries = [_log(node_name, f"Downloading content for {len(paper_dicts)} papers")]

    try:
        # Rehydrate models, fetch PDFs, then re-serialize with updated pdf_path.
        hydrated = PDFRetriever().retrieve_batch(
            [_deserialize_paper(d) for d in paper_dicts]
        )
        with_pdf = [p for p in hydrated if p.pdf_path]
        log_entries.append(
            _log(node_name, f"Downloaded {len(with_pdf)}/{len(hydrated)} PDFs")
        )

        return {
            "papers": [_serialize_paper(p) for p in hydrated],
            "downloaded_pdfs": [p.pdf_path for p in with_pdf if p.pdf_path],
            "current_node": node_name,
            "progress_percent": 50,
            "logs": log_entries,
        }
    except Exception as exc:
        logger.exception(f"Download node failed: {exc}")
        log_entries.append(_log(node_name, f"Error: {exc}", "error"))
        return {
            "downloaded_pdfs": [],
            "current_node": node_name,
            "status": "failed",
            "error": str(exc),
            "logs": log_entries,
        }
195
+
196
+
197
def extract_node(state: LiteratureState) -> dict:
    """
    Data extraction node.
    Uses existing DataExtractor (synchronous).
    """
    node_name = "extract"
    log_entries = [_log(node_name, "Extracting structured data from papers")]

    try:
        # Only papers with a downloaded PDF or at least an abstract are usable.
        candidates = [
            p
            for p in (_deserialize_paper(d) for d in state["papers"])
            if p.pdf_path or p.abstract
        ]

        if not candidates:
            log_entries.append(_log(node_name, "No papers with content to extract", "warning"))
            return {
                "extraction_results": [],
                "current_node": node_name,
                "progress_percent": 75,
                "logs": log_entries,
            }

        log_entries.append(
            _log(node_name, f"Processing {len(candidates)} papers with content")
        )

        results = DataExtractor().extract_from_papers(
            candidates,
            use_full_text=state["use_full_text"]
        )

        point_total = sum(len(r.data_points) for r in results if r.success)
        log_entries.append(
            _log(node_name, f"Extracted {point_total} data points from {len(results)} papers")
        )

        # Serialize each result into a plain dict for state storage.
        serialized = [
            {
                "paper_id": r.paper.id if r.paper else "unknown",
                "success": r.success,
                "error_message": r.error_message,
                "data_points": [dp.model_dump() for dp in r.data_points] if r.data_points else [],
            }
            for r in results
        ]

        return {
            "extraction_results": serialized,
            "current_node": node_name,
            "progress_percent": 75,
            "logs": log_entries,
        }
    except Exception as exc:
        logger.exception(f"Extract node failed: {exc}")
        log_entries.append(_log(node_name, f"Error: {exc}", "error"))
        return {
            "extraction_results": [],
            "current_node": node_name,
            "status": "failed",
            "error": str(exc),
            "logs": log_entries,
        }
258
+
259
+
260
def quality_node(state: LiteratureState) -> dict:
    """
    Quality assessment node.
    """
    node_name = "quality"
    log_entries = [_log(node_name, "Assessing data quality")]

    try:
        # Rehydrate every data point from the serialized extraction results;
        # skip individual points that fail deserialization.
        all_points: List[PolymerDataPoint] = []
        for result_entry in state["extraction_results"]:
            if not (result_entry.get("success") and result_entry.get("data_points")):
                continue
            for payload in result_entry["data_points"]:
                try:
                    all_points.append(PolymerDataPoint(**payload))
                except Exception as exc:
                    logger.warning(f"Failed to deserialize data point: {exc}")

        if not all_points:
            log_entries.append(_log(node_name, "No data points to assess", "warning"))
            return {
                "verified_data": [],
                "quality_report": None,
                "current_node": node_name,
                "progress_percent": 100,
                "status": "completed",
                "logs": log_entries,
            }

        verified, report = QualityAssessor().assess_batch(all_points)
        log_entries.append(_log(node_name, report.summary()))

        return {
            "verified_data": [dp.model_dump() for dp in verified],
            "quality_report": {
                "total_points": report.total_points,
                "gold_count": report.gold_count,
                "silver_count": report.silver_count,
                "bronze_count": report.bronze_count,
                "invalid_count": report.invalid_count,
                "validation_errors": report.validation_errors,
            },
            "current_node": node_name,
            "progress_percent": 100,
            "status": "completed",
            "logs": log_entries,
        }
    except Exception as exc:
        logger.exception(f"Quality node failed: {exc}")
        log_entries.append(_log(node_name, f"Error: {exc}", "error"))
        return {
            "verified_data": [],
            "quality_report": None,
            "current_node": node_name,
            "status": "failed",
            "error": str(exc),
            "logs": log_entries,
        }
326
+
327
+
328
+ # ============== Conditional Edges ==============
329
+
330
def should_continue_after_discover(state: LiteratureState) -> str:
    """Route to download only when discovery succeeded and found papers."""
    failed = state.get("status") == "failed"
    empty = not state.get("papers")
    return "end" if failed or empty else "download"
337
+
338
+
339
def should_continue_after_download(state: LiteratureState) -> str:
    """Route to extract unless the run failed or there is nothing to process.

    Note: papers without PDFs still proceed, since abstracts may be usable.
    """
    if state.get("status") == "failed":
        return "end"
    has_material = bool(state.get("downloaded_pdfs")) or bool(state.get("papers"))
    return "extract" if has_material else "end"
346
+
347
+
348
def should_continue_after_extract(state: LiteratureState) -> str:
    """Route to quality assessment only if at least one data point was extracted."""
    if state.get("status") == "failed":
        return "end"

    successful = (
        r for r in state.get("extraction_results", []) if r.get("success")
    )
    extracted = sum(len(r.get("data_points", [])) for r in successful)
    return "quality" if extracted else "end"
363
+
364
+
365
+ # ============== Graph Builder ==============
366
+
367
def create_literature_graph(checkpointer=None):
    """
    Create the literature mining workflow graph.

    Args:
        checkpointer: Optional checkpoint storage (defaults to MemorySaver)

    Returns:
        Compiled LangGraph
    """
    workflow = StateGraph(LiteratureState)

    # Register the four pipeline stages.
    for name, fn in (
        ("discover", discover_node),
        ("download", download_node),
        ("extract", extract_node),
        ("quality", quality_node),
    ):
        workflow.add_node(name, fn)

    # Wire the pipeline; each stage can short-circuit to END on failure/empty results.
    workflow.add_edge(START, "discover")
    workflow.add_conditional_edges(
        "discover",
        should_continue_after_discover,
        {"download": "download", "end": END},
    )
    workflow.add_conditional_edges(
        "download",
        should_continue_after_download,
        {"extract": "extract", "end": END},
    )
    workflow.add_conditional_edges(
        "extract",
        should_continue_after_extract,
        {"quality": "quality", "end": END},
    )
    workflow.add_edge("quality", END)

    if checkpointer is None:
        checkpointer = MemorySaver()

    return workflow.compile(checkpointer=checkpointer)
415
+
416
+
417
+ # ============== Sync Runner ==============
418
+
419
def run_workflow(
    query: str,
    max_papers: int = 10,
    use_full_text: bool = False,
    thread_id: str = "default",
    on_state_update: Optional[Callable[[LiteratureState], None]] = None,
) -> LiteratureState:
    """
    Run the literature mining workflow (synchronous).

    Args:
        query: Search query
        max_papers: Max papers per source
        use_full_text: Whether to use full text extraction
        thread_id: Thread ID for state recovery
        on_state_update: Callback invoked with each streamed state snapshot

    Returns:
        Final state
    """
    graph = create_literature_graph()
    starting_state = create_initial_state(query, max_papers, use_full_text)
    run_config = {"configurable": {"thread_id": thread_id}}

    latest = None
    # stream_mode="values" yields the full state after every node execution.
    for snapshot in graph.stream(starting_state, run_config, stream_mode="values"):
        latest = snapshot
        if on_state_update:
            on_state_update(snapshot)

    return latest
literature/property_registry.py ADDED
@@ -0,0 +1,274 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Property catalog and extraction prompt builder for production literature mining.
3
+
4
+ This registry is aligned to the platform's public property keys so staged
5
+ literature evidence can be consumed by Property Probe and Discovery without
6
+ ad-hoc remapping.
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import re
11
+ from typing import Dict, List, Optional
12
+
13
+
14
# Canonical property key -> display metadata. Keys match the platform's public
# property identifiers so staged evidence needs no ad-hoc remapping; "unit" is
# the standard unit each property is expressed in.
PROPERTY_CATALOG: Dict[str, Dict[str, str]] = {
    # Thermal
    "tm": {"name": "Melting temperature", "unit": "K"},
    "tg": {"name": "Glass transition temperature", "unit": "K"},
    "td": {"name": "Thermal diffusivity", "unit": "m^2/s"},
    "tc": {"name": "Thermal conductivity", "unit": "W/(m*K)"},
    "cp": {"name": "Specific heat capacity", "unit": "J/(kg*K)"},
    # Mechanical
    "young": {"name": "Young's modulus", "unit": "GPa"},
    "shear": {"name": "Shear modulus", "unit": "GPa"},
    "bulk": {"name": "Bulk modulus", "unit": "GPa"},
    "poisson": {"name": "Poisson ratio", "unit": "dimensionless"},
    # Transport
    "visc": {"name": "Viscosity", "unit": "Pa*s"},
    "dif": {"name": "Diffusivity", "unit": "cm^2/s"},
    # Gas permeability
    "phe": {"name": "He permeability", "unit": "Barrer"},
    "ph2": {"name": "H2 permeability", "unit": "Barrer"},
    "pco2": {"name": "CO2 permeability", "unit": "Barrer"},
    "pn2": {"name": "N2 permeability", "unit": "Barrer"},
    "po2": {"name": "O2 permeability", "unit": "Barrer"},
    "pch4": {"name": "CH4 permeability", "unit": "Barrer"},
    # Electronic / optical
    "alpha": {"name": "Polarizability", "unit": "a.u."},
    "homo": {"name": "HOMO energy", "unit": "eV"},
    "lumo": {"name": "LUMO energy", "unit": "eV"},
    "bandgap": {"name": "Band gap", "unit": "eV"},
    "mu": {"name": "Dipole moment", "unit": "Debye"},
    "etotal": {"name": "Total electronic energy", "unit": "eV"},
    "ri": {"name": "Refractive index", "unit": "dimensionless"},
    "dc": {"name": "Dielectric constant", "unit": "dimensionless"},
    "pe": {"name": "Permittivity", "unit": "dimensionless"},
    # Structural / physical
    "rg": {"name": "Radius of gyration", "unit": "Angstrom"},
    "rho": {"name": "Density", "unit": "g/cm^3"},
    # Extended literature-only properties retained for discovery/search
    "electrical_conductivity": {"name": "Electrical conductivity", "unit": "S/cm"},
    "seebeck_coefficient": {"name": "Seebeck coefficient", "unit": "uV/K"},
    "power_factor": {"name": "Power factor", "unit": "uW/(m*K^2)"},
    "zt_figure_of_merit": {"name": "ZT figure of merit", "unit": "dimensionless"},
    "tensile_strength": {"name": "Tensile strength", "unit": "MPa"},
    "elongation_at_break": {"name": "Elongation at break", "unit": "%"},
    "crystallinity": {"name": "Crystallinity", "unit": "%"},
}
58
+
59
+
60
# Subset of PROPERTY_CATALOG keys that the platform itself consumes (excludes
# the literature-only extended properties).
PLATFORM_PROPERTY_KEYS = [
    "tm", "tg", "td", "tc", "cp",
    "young", "shear", "bulk", "poisson",
    "visc", "dif",
    "phe", "ph2", "pco2", "pn2", "po2", "pch4",
    "alpha", "homo", "lumo", "bandgap", "mu", "etotal", "ri", "dc", "pe",
    "rg", "rho",
]


# Named groups of property keys used as extraction presets.
TEMPLATES: Dict[str, List[str]] = {
    "thermal": ["tm", "tg", "td", "tc", "cp"],
    "mechanical": ["young", "shear", "bulk", "poisson", "tensile_strength", "elongation_at_break"],
    "electronic": ["bandgap", "homo", "lumo", "ri", "dc", "pe", "alpha", "mu", "etotal"],
    "gas_permeability": ["pco2", "po2", "pn2", "ph2", "phe", "pch4"],
    "transport": ["visc", "dif", "tc", "electrical_conductivity", "seebeck_coefficient", "power_factor"],
    "platform_core": PLATFORM_PROPERTY_KEYS,
}

# Human-readable labels for the template presets above (UI display).
TEMPLATE_LABELS: Dict[str, str] = {
    "thermal": "Thermal",
    "mechanical": "Mechanical",
    "electronic": "Electronic / Optical",
    "gas_permeability": "Gas Permeability",
    "transport": "Transport / Energy",
    "platform_core": "Platform Core",
}
87
+
88
+
89
# Free-form text -> canonical property key. Lookups go through _norm(), and the
# dict is further seeded below with normalized catalog keys and display names.
PROPERTY_ALIASES: Dict[str, str] = {
    "thermal conductivity": "tc",
    "heat conductivity": "tc",
    "thermal diffusivity": "td",
    "heat diffusivity": "td",
    "specific heat": "cp",
    "heat capacity": "cp",
    "young modulus": "young",
    "youngs modulus": "young",
    "young_s_modulus": "young",
    "young_modulus": "young",
    "shear modulus": "shear",
    "shear_modulus": "shear",
    "bulk modulus": "bulk",
    "bulk_modulus": "bulk",
    "poisson ratio": "poisson",
    "poisson_ratio": "poisson",
    "viscosity": "visc",
    "diffusivity": "dif",
    "he permeability": "phe",
    "helium permeability": "phe",
    "h2 permeability": "ph2",
    "co2 permeability": "pco2",
    "n2 permeability": "pn2",
    "o2 permeability": "po2",
    "ch4 permeability": "pch4",
    "polarizability": "alpha",
    "homo energy": "homo",
    "lumo energy": "lumo",
    "band gap": "bandgap",
    "bandgap": "bandgap",
    "dipole moment": "mu",
    "total electronic energy": "etotal",
    "refractive index": "ri",
    "dielectric constant": "dc",
    "permittivity": "pe",
    "radius of gyration": "rg",
    "density": "rho",
    "electrical conductivity": "electrical_conductivity",
    "conductivity": "electrical_conductivity",
    "seebeck coefficient": "seebeck_coefficient",
    "power factor": "power_factor",
    "zt": "zt_figure_of_merit",
    "zt figure of merit": "zt_figure_of_merit",
    "tensile strength": "tensile_strength",
    "elongation at break": "elongation_at_break",
    "co2_permeability": "pco2",
    "o2_permeability": "po2",
    "n2_permeability": "pn2",
    "h2_permeability": "ph2",
    "he_permeability": "phe",
    "ch4_permeability": "pch4",
    "radius_of_gyration": "rg",
    "refractive_index": "ri",
    "dielectric_constant": "dc",
    "dipole_moment": "mu",
}
146
+
147
+
148
+ def _norm(text: str) -> str:
149
+ normalized = re.sub(r"[^a-z0-9]+", " ", str(text or "").strip().lower())
150
+ return re.sub(r"\s+", " ", normalized).strip()
151
+
152
+
153
# Seed PROPERTY_ALIASES with the normalized catalog keys and display names so
# exact key/name lookups resolve without hand-listing every entry; setdefault
# keeps any explicitly-declared alias from being overwritten.
for key, meta in PROPERTY_CATALOG.items():
    PROPERTY_ALIASES.setdefault(_norm(key), key)
    PROPERTY_ALIASES.setdefault(_norm(meta["name"]), key)
156
+
157
+
158
def normalize_property_key(value: str | None) -> Optional[str]:
    """Map free-form property text to a canonical registry key, or None."""
    if not value:
        return None
    candidate = PROPERTY_ALIASES.get(_norm(value))
    # Only keys present in the catalog are considered canonical.
    return candidate if candidate in PROPERTY_CATALOG else None
166
+
167
+
168
def detect_property_keys(text: str) -> List[str]:
    """Return all unique property keys whose aliases appear in the free-form text."""
    haystack = _norm(text)
    # Dict keys preserve first-insertion order, giving de-duplication for free.
    hits = {
        canonical: None
        for alias, canonical in PROPERTY_ALIASES.items()
        if alias and alias in haystack
    }
    return list(hits)
176
+
177
+
178
def property_display_name(key: str) -> str:
    """Human-readable 'Name (unit)' label for a key; falls back to the key itself."""
    meta = PROPERTY_CATALOG.get(key)
    return f"{meta['name']} ({meta['unit']})" if meta else key
183
+
184
+
185
def _property_list_block(property_keys: List[str]) -> str:
    """Build the target-properties bullet list for the extraction prompt."""
    rendered = []
    for key in property_keys:
        meta = PROPERTY_CATALOG.get(key)
        # Unknown keys still get a bare bullet so the caller's list is preserved.
        line = (
            f"- `{key}` ({meta['name']}) -- standard unit: {meta['unit']}"
            if meta
            else f"- `{key}`"
        )
        rendered.append(line)
    return "\n".join(rendered)
195
+
196
+
197
def build_extraction_prompt(
    property_keys: List[str],
    extra_instructions: str = "",
) -> str:
    """
    Build a dynamic contextualized extraction prompt from the given property list.

    Args:
        property_keys: Property keys (or free-form names) to target; each is
            normalized via normalize_property_key, falling back to the raw key.
        extra_instructions: Optional extra context appended as its own section.

    Returns:
        The prompt text. Note it still contains literal ``{title}`` and
        ``{content}`` placeholders (``{{...}}`` in the f-string below) —
        presumably filled in later by the caller via str.format; verify there.
    """
    normalized_keys = [normalize_property_key(k) or k for k in property_keys if k]
    props_block = _property_list_block(normalized_keys)

    # Only emit the ADDITIONAL CONTEXT section when instructions were given.
    extra_section = ""
    if extra_instructions.strip():
        extra_section = f"""
## ADDITIONAL CONTEXT

{extra_instructions.strip()}
"""

    prompt = f"""You are an expert in polymer science and materials characterization.
Extract experimentally grounded evidence records from the provided paper.

## CRITICAL REQUIREMENTS

1. Extract each material-property-value observation as a separate record
2. Preserve the original value and unit exactly as written
3. Include experimental conditions and measurement method whenever available
4. Include a source quote and source location for every record
5. Ignore theoretical-only values unless the paper explicitly reports an experiment-backed measurement

## TARGET PROPERTIES

For each data point, extract these properties:
{props_block}
{extra_section}
## OUTPUT FORMAT (JSON Array)

Return ONLY valid JSON, no markdown, no explanation:

[
  {{
    "polymer_name": "P3HT",
    "property_name": "<one of the target property keys above>",
    "raw_value": "1.9",
    "raw_unit": "eV",
    "conditions": {{
      "solvent": "chloroform",
      "annealing_temp_c": 150,
      "annealing_time_min": 10,
      "measurement_temp_k": 300,
      "measurement_method": "UV-Vis"
    }},
    "source_quote": "The optical band gap of P3HT was determined to be 1.9 eV from the UV-Vis absorption onset.",
    "source_location": "Table 1",
    "extraction_confidence": 0.95
  }}
]

## RULES

1. If values range "from X to Y", extract BOTH as separate points
2. Preserve scientific notation as "5.2e3" or actual number
3. If no source quote is available, lower extraction_confidence below 0.5
4. Prefer experimentally measured values over model predictions or simulations
5. Return ONLY a valid JSON array, no extra text

---

**PAPER CONTENT:**

Title: {{title}}

{{content}}

---

JSON output:
"""
    return prompt
literature/quality.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Production quality assessment and validation for literature evidence.
3
+ """
4
+ from __future__ import annotations
5
+
6
+ import logging
7
+ from dataclasses import dataclass
8
+ from typing import Any, Dict, List, Optional, Tuple
9
+
10
+ from .schemas import ContextualizedValue, DataQuality, PolymerDataPoint
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
@dataclass
class QualityReport:
    """Data quality report for a batch of data points."""
    total_points: int
    gold_count: int
    silver_count: int
    bronze_count: int
    invalid_count: int
    validation_errors: List[str]

    @property
    def gold_ratio(self) -> float:
        """Fraction of points rated gold; safe for an empty batch (returns 0.0)."""
        denominator = max(self.total_points, 1)
        return self.gold_count / denominator

    def summary(self) -> str:
        """Multi-line, human-readable summary of the tier counts."""
        lines = [
            f"Quality Report: {self.total_points} points",
            f"  Gold: {self.gold_count} ({self.gold_ratio:.1%})",
            f"  Silver: {self.silver_count}",
            f"  Bronze: {self.bronze_count}",
            f"  Invalid: {self.invalid_count}",
            f"  Errors: {len(self.validation_errors)}",
        ]
        return "\n".join(lines)
38
+
39
+
40
class QualityAssessor:
    """Quality assessor with property-aware sanity checks.

    Supports two paths:
    - assess_batch: legacy path operating on PolymerDataPoint objects
      (mutates each point's quality_tier in place).
    - validate_contextual_value / assess_contextual_quality: per-record
      validation and tiering for ContextualizedValue evidence.
    """

    # Plausible (low, high) ranges per canonical property key, in the
    # registry's standard units; values outside are rejected as invalid.
    # A None bound means "unbounded on that side".
    PROPERTY_BOUNDS: Dict[str, Tuple[Optional[float], Optional[float]]] = {
        "tm": (50, 2000),
        "tg": (50, 2000),
        "td": (1e-10, 1.0),
        "tc": (1e-4, 1000.0),
        "cp": (1.0, 1e7),
        "young": (1e-6, 1e5),
        "shear": (1e-6, 1e5),
        "bulk": (1e-6, 1e5),
        "poisson": (-1.0, 0.5),
        "visc": (1e-9, 1e9),
        "dif": (1e-12, 10.0),
        "rho": (1e-6, 100.0),
        "ri": (0.5, 10.0),
        "bandgap": (-20.0, 20.0),
        "homo": (-30.0, 10.0),
        "lumo": (-30.0, 20.0),
        "mu": (0.0, 1e4),
        "electrical_conductivity": (1e-12, 1e8),
        "seebeck_coefficient": (-1e5, 1e5),
        "power_factor": (0.0, 1e9),
        "zt_figure_of_merit": (0.0, 1e4),
    }

    def __init__(self) -> None:
        # Accumulated validation error messages from the last assess_batch run.
        self.errors: List[str] = []

    def assess_batch(self, data_points: List[PolymerDataPoint]) -> Tuple[List[PolymerDataPoint], QualityReport]:
        """Legacy compatibility path used by older scripts."""
        # Reset per-run error accumulator.
        self.errors = []
        valid_points: List[PolymerDataPoint] = []
        gold_count = silver_count = bronze_count = invalid_count = 0

        for dp in data_points:
            is_valid, error_msg = self._validate_legacy(dp)
            if not is_valid:
                self.errors.append(f"{dp.source_paper_id}: {error_msg}")
                invalid_count += 1
                continue

            # NOTE: mutates the data point in place with its computed tier.
            dp.quality_tier = self._compute_legacy_quality_tier(dp)
            if dp.quality_tier == DataQuality.GOLD:
                gold_count += 1
            elif dp.quality_tier == DataQuality.SILVER:
                silver_count += 1
            else:
                bronze_count += 1
            valid_points.append(dp)

        report = QualityReport(
            total_points=len(data_points),
            gold_count=gold_count,
            silver_count=silver_count,
            bronze_count=bronze_count,
            invalid_count=invalid_count,
            validation_errors=self.errors.copy(),
        )
        logger.info(report.summary())
        return valid_points, report

    def validate_contextual_value(self, value: ContextualizedValue) -> Tuple[bool, Optional[str]]:
        """Validate one contextual record; returns (is_valid, error_message)."""
        if not value.polymer_name or value.polymer_name.strip().lower() == "unknown":
            return False, "Missing material name"
        if not value.property_name:
            return False, "Missing property key"
        if value.standardized_value is None:
            return False, "Missing standardized value"
        if not value.source_quote or len(value.source_quote.strip()) < 10:
            return False, "Missing source quote"

        # Properties without declared bounds pass the range check automatically.
        bounds = self.PROPERTY_BOUNDS.get(value.property_name)
        if bounds is None:
            return True, None

        low, high = bounds
        numeric = value.standardized_value
        if low is not None and numeric < low:
            return False, f"Value below plausible range: {numeric}"
        if high is not None and numeric > high:
            return False, f"Value above plausible range: {numeric}"
        return True, None

    def assess_contextual_quality(self, value: ContextualizedValue) -> DataQuality:
        """Score a contextual record into a GOLD/SILVER/BRONZE tier.

        Score components: standardized value (+2), up to 3 reported
        conditions, measurement method (+1), source location (+1), and
        extraction confidence (+2 at >=0.9, +1 at >=0.7).
        """
        score = 0
        if value.standardized_value is not None:
            score += 2
        if value.conditions.to_dict():
            score += min(len(value.conditions.to_dict()), 3)
        if value.conditions.measurement_method:
            score += 1
        if value.source_location:
            score += 1
        if value.extraction_confidence >= 0.9:
            score += 2
        elif value.extraction_confidence >= 0.7:
            score += 1

        if score >= 7:
            return DataQuality.GOLD
        if score >= 4:
            return DataQuality.SILVER
        return DataQuality.BRONZE

    def _validate_legacy(self, dp: PolymerDataPoint) -> Tuple[bool, Optional[str]]:
        """Minimal validation for legacy points: name plus any one measurement."""
        if not dp.polymer_name or dp.polymer_name == "Unknown":
            return False, "Missing polymer name"
        has_measurement = any([
            dp.electrical_conductivity_s_cm is not None,
            dp.thermal_conductivity_w_mk is not None,
            dp.seebeck_coefficient_uv_k is not None,
        ])
        if not has_measurement:
            return False, "No measurement values"
        return True, None

    def _compute_legacy_quality_tier(self, dp: PolymerDataPoint) -> DataQuality:
        """Tier a legacy point by which measurements/metadata are present."""
        score = 0
        if dp.electrical_conductivity_s_cm is not None:
            score += 3
        if dp.seebeck_coefficient_uv_k is not None:
            score += 2
        if dp.power_factor_uw_m_k2 is not None:
            score += 1
        # Thermal conductivity is weighted highest as the rarest measurement here.
        if dp.thermal_conductivity_w_mk is not None:
            score += 4
        if dp.source_table_or_figure:
            score += 1
        if dp.annealing_temp_c is not None:
            score += 1
        if score >= 7:
            return DataQuality.GOLD
        if score >= 4:
            return DataQuality.SILVER
        return DataQuality.BRONZE
literature/retrieval.py ADDED
@@ -0,0 +1,398 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PDF retrieval module.
3
+ Downloads papers from ArXiv (priority) and via Unpaywall.
4
+ Implements robust header spoofing and graceful error handling.
5
+ """
6
+ import logging
7
+ import os
8
+ import time
9
+ from pathlib import Path
10
+ from typing import Optional, List
11
+ import requests
12
+
13
+ from .schemas import PaperMetadata, PaperSource
14
+ from .config import get_config
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
class PDFRetriever:
    """
    PDF retrieval with robust error handling.

    Priority:
        1. ArXiv (direct, free, reliable)
        2. PubMed Central (PMC) for PubMed papers
        3. Existing pdf_url from metadata
        4. Unpaywall via DOI
    """

    def __init__(self) -> None:
        config = get_config()
        # Local cache directory for downloaded PDFs; created on demand.
        self.storage_dir = Path(config.pdf_storage_dir)
        self.storage_dir.mkdir(parents=True, exist_ok=True)

        # Browser-like headers to avoid 403 from some publishers.
        self.headers = {
            "User-Agent": config.user_agent,
            "Accept": "application/pdf,*/*",
            "Accept-Language": "en-US,en;q=0.9",
            "Accept-Encoding": "gzip, deflate, br",
            "Connection": "keep-alive",
        }

        self.timeout = 60  # seconds per download request
        # Unpaywall requires a contact email; reuse the PubMed one.
        self.unpaywall_email = config.pubmed_email

    def retrieve_batch(
        self,
        papers: List[PaperMetadata],
        skip_existing: bool = True
    ) -> List[PaperMetadata]:
        """
        Download PDFs for a batch of papers.

        Updates paper.pdf_path for successful downloads.
        Saves all papers and failed downloads to CSVs.

        Args:
            papers: List of paper metadata
            skip_existing: Skip if PDF already exists

        Returns:
            Updated list of papers with pdf_path set where successful
        """
        successful_ids: set = set()
        failed_papers: List[PaperMetadata] = []

        for paper in papers:
            try:
                pdf_path = self.retrieve_single(paper, skip_existing=skip_existing)
                if pdf_path:
                    paper.pdf_path = pdf_path
                    successful_ids.add(paper.id)
                else:
                    failed_papers.append(paper)
            except Exception as e:
                # One bad paper must not abort the whole batch.
                logger.warning(f"PDF retrieval failed for {paper.id}: {e}")
                failed_papers.append(paper)

        logger.info(f"PDF retrieval complete: {len(successful_ids)} successful, {len(failed_papers)} failed")

        # Save all papers with download status
        self._save_all_papers(papers, successful_ids)

        # Save failed downloads for manual retrieval
        if failed_papers:
            self._save_failed_downloads(failed_papers)

        return papers

    def _save_failed_downloads(self, papers: List[PaperMetadata]) -> None:
        """Append failed downloads to failed_downloads.csv for manual retrieval."""
        import csv
        from datetime import datetime

        csv_path = self.storage_dir / "failed_downloads.csv"
        file_exists = csv_path.exists()

        with open(csv_path, "a", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)

            # Write header if new file
            if not file_exists:
                writer.writerow([
                    "timestamp", "paper_id", "title", "source", "doi", "url", "expected_filename"
                ])

            timestamp = datetime.now().isoformat()
            for paper in papers:
                # Mirror the filename scheme used by retrieve_single so a
                # manually fetched PDF can be dropped into storage_dir.
                safe_id = paper.id.replace("/", "_").replace(":", "_")
                expected_filename = f"{safe_id}.pdf"
                writer.writerow([
                    timestamp,
                    paper.id,
                    paper.title[:100],  # Truncate long titles
                    paper.source.value,
                    paper.doi or "",
                    paper.url or "",
                    expected_filename
                ])

        logger.info(f"Saved {len(papers)} failed downloads to {csv_path}")

    def _save_all_papers(
        self,
        papers: List[PaperMetadata],
        successful_ids: set
    ) -> None:
        """Overwrite all_papers.csv with every discovered paper and its download status."""
        import csv
        from datetime import datetime

        csv_path = self.storage_dir / "all_papers.csv"

        with open(csv_path, "w", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            writer.writerow([
                "paper_id", "title", "source", "year", "doi", "url",
                "pdf_downloaded", "pdf_path", "timestamp"
            ])

            timestamp = datetime.now().isoformat()
            for paper in papers:
                # A pre-existing pdf_path also counts as downloaded.
                downloaded = paper.id in successful_ids or paper.pdf_path is not None
                writer.writerow([
                    paper.id,
                    paper.title[:150],
                    paper.source.value,
                    paper.year or "",
                    paper.doi or "",
                    paper.url or "",
                    "YES" if downloaded else "NO",
                    paper.pdf_path or "",
                    timestamp
                ])

        logger.info(f"Saved {len(papers)} papers to {csv_path}")

    def retrieve_single(
        self,
        paper: PaperMetadata,
        skip_existing: bool = True
    ) -> Optional[str]:
        """
        Download PDF for a single paper.

        Args:
            paper: Paper metadata
            skip_existing: Skip if file already exists

        Returns:
            Path to downloaded PDF, or None if failed
        """
        # Determine a filesystem-safe filename derived from the paper ID.
        safe_id = paper.id.replace("/", "_").replace(":", "_")
        pdf_filename = f"{safe_id}.pdf"
        pdf_path = self.storage_dir / pdf_filename

        # Check if already exists
        if skip_existing and pdf_path.exists():
            logger.debug(f"PDF already exists: {pdf_path}")
            return str(pdf_path)

        # Try download methods in priority order
        pdf_url = self._get_pdf_url(paper)

        if pdf_url:
            success = self._download_pdf(pdf_url, pdf_path)
            if success:
                logger.info(f"Downloaded PDF: {pdf_path}")
                return str(pdf_path)

        logger.warning(f"Could not download PDF for {paper.id}")
        return None

    def _get_pdf_url(self, paper: PaperMetadata) -> Optional[str]:
        """
        Get PDF URL using priority order:
        1. ArXiv direct link
        2. PubMed Central (PMC) for PubMed papers
        3. Existing pdf_url from metadata
        4. Unpaywall via DOI
        """
        # Priority 1: ArXiv (most reliable, free)
        if paper.source == PaperSource.ARXIV:
            arxiv_id = paper.id.replace("arxiv_", "")
            return f"https://arxiv.org/pdf/{arxiv_id}.pdf"

        # Priority 2: PubMed - try PMC first
        if paper.source == PaperSource.PUBMED:
            pmc_url = self._get_pmc_pdf_url(paper)
            if pmc_url:
                return pmc_url

        # Priority 3: Use existing pdf_url if available
        if paper.pdf_url:
            return paper.pdf_url

        # Priority 4: Try Unpaywall via DOI (works for all sources)
        if paper.doi:
            unpaywall_url = self._get_unpaywall_url(paper.doi)
            if unpaywall_url:
                return unpaywall_url

        return None

    def _get_pmc_pdf_url(self, paper: PaperMetadata) -> Optional[str]:
        """
        Try to get PDF from PubMed Central (PMC).
        PMC provides free full-text PDFs for many PubMed articles.

        Returns the PMC PDF URL, or None if no PMC record is linked.
        """
        try:
            pmid = paper.id.replace("pubmed_", "")

            # elink maps a PubMed ID to its PMC record, if any.
            from Bio import Entrez
            handle = Entrez.elink(dbfrom="pubmed", db="pmc", id=pmid)
            record = Entrez.read(handle)
            handle.close()

            # Check if PMC ID exists
            link_sets = record[0].get("LinkSetDb", [])
            for link_set in link_sets:
                if link_set.get("DbTo") == "pmc":
                    links = link_set.get("Link", [])
                    if links:
                        pmc_id = links[0]["Id"]
                        # PMC PDF URL format
                        return f"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC{pmc_id}/pdf/"

            return None

        except Exception as e:
            # Best-effort lookup: fall through to the next retrieval source.
            logger.debug(f"PMC lookup failed for {paper.id}: {e}")
            return None

    def _get_unpaywall_url(self, doi: str) -> Optional[str]:
        """
        Query Unpaywall API for open-access PDF URL.

        Args:
            doi: Paper DOI

        Returns:
            PDF URL if found, None otherwise
        """
        try:
            url = f"https://api.unpaywall.org/v2/{doi}"
            params = {"email": self.unpaywall_email}

            response = requests.get(
                url,
                params=params,
                headers=self.headers,
                timeout=30
            )

            if response.status_code != 200:
                logger.debug(f"Unpaywall returned {response.status_code} for {doi}")
                return None

            data = response.json()

            # Check for best open access location
            best_oa = data.get("best_oa_location")
            if best_oa and best_oa.get("url_for_pdf"):
                return best_oa["url_for_pdf"]

            # Check all OA locations
            oa_locations = data.get("oa_locations", [])
            for loc in oa_locations:
                if loc.get("url_for_pdf"):
                    return loc["url_for_pdf"]

            return None

        except Exception as e:
            logger.debug(f"Unpaywall query failed for {doi}: {e}")
            return None

    def _download_pdf(self, url: str, save_path: Path) -> bool:
        """
        Download PDF from URL with robust error handling.

        The response is streamed to disk; only the first chunk is held in
        memory for the %PDF magic-byte check (the previous implementation
        read the whole body via response.content, defeating stream=True).

        Args:
            url: PDF URL
            save_path: Local path to save file

        Returns:
            True if successful, False otherwise
        """
        try:
            logger.debug(f"Downloading PDF from: {url}")

            with requests.get(
                url,
                headers=self.headers,
                timeout=self.timeout,
                stream=True,
                allow_redirects=True
            ) as response:
                # Check for success
                if response.status_code != 200:
                    logger.warning(f"Download failed with status {response.status_code}: {url}")
                    return False

                chunks = response.iter_content(chunk_size=8192)
                first_chunk = next(chunks, b"")

                # Verify it's a PDF (check content-type or magic bytes)
                content_type = response.headers.get("content-type", "").lower()
                if "pdf" not in content_type and "octet-stream" not in content_type:
                    # Check magic bytes as fallback
                    if not first_chunk.startswith(b"%PDF"):
                        logger.warning(f"Response is not a PDF: {content_type}")
                        return False

                # Save to file
                with open(save_path, "wb") as f:
                    f.write(first_chunk)
                    for chunk in chunks:
                        f.write(chunk)

            # Verify file was written
            if save_path.exists() and save_path.stat().st_size > 0:
                return True

            return False

        except requests.exceptions.Timeout:
            logger.warning(f"Download timeout: {url}")
            return False
        except requests.exceptions.RequestException as e:
            logger.warning(f"Download error: {e}")
            return False
        except Exception as e:
            logger.error(f"Unexpected error downloading {url}: {e}")
            return False
355
+
356
+
357
def extract_text_from_pdf(pdf_path: str, max_pages: int = 100) -> Optional[str]:
    """
    Extract text from PDF using pymupdf.

    Args:
        pdf_path: Path to PDF file
        max_pages: Maximum pages to extract (default 100)

    Returns:
        Extracted text with "--- Page N ---" markers, or None if
        pymupdf is unavailable, extraction fails, or the text is empty
    """
    try:
        import pymupdf  # fitz
    except ImportError:
        try:
            # Older installs expose the library under its legacy name.
            import fitz as pymupdf
        except ImportError:
            logger.error("pymupdf not installed. Run: pip install pymupdf")
            return None

    try:
        doc = pymupdf.open(pdf_path)
        try:
            pages_to_extract = min(len(doc), max_pages)
            text_parts: List[str] = []

            for page_num in range(pages_to_extract):
                page = doc[page_num]
                text = page.get_text()
                if text:
                    text_parts.append(f"--- Page {page_num + 1} ---\n{text}")
        finally:
            # Ensure the document handle is released even if a page fails.
            doc.close()

        full_text = "\n\n".join(text_parts)
        logger.info(f"Extracted {len(full_text)} chars from {pages_to_extract} pages of {pdf_path}")

        return full_text if full_text.strip() else None

    except Exception as e:
        logger.error(f"PDF text extraction failed for {pdf_path}: {e}")
        return None
literature/schemas.py ADDED
@@ -0,0 +1,329 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Domain-specific data models for literature mining.
3
+ Supports contextualized extraction with source traceability.
4
+ """
5
+ from typing import Optional, List, Dict, Any
6
+ from pydantic import BaseModel, Field, field_validator, model_validator, ConfigDict
7
+ from datetime import datetime
8
+ from enum import Enum
9
+
10
+
11
class DataQuality(str, Enum):
    """Data quality tier assigned to extracted evidence."""
    GOLD = "gold"      # Complete data with source quote
    SILVER = "silver"  # Partial data with source
    BRONZE = "bronze"  # Limited data or no source
    ERROR = "error"    # Extraction failed
17
+
18
+
19
class QueryMode(str, Enum):
    """High-level search entrypoint modes for the literature UI."""
    MATERIAL = "material-first"  # query anchored on a material/polymer name
    PROPERTY = "property-first"  # query anchored on a target property
    TASK = "task-first"          # query anchored on a free-form task description
24
+
25
+
26
class ReviewStatus(str, Enum):
    """Human review status for staged evidence records."""
    PENDING = "pending"    # awaiting reviewer decision (default)
    APPROVED = "approved"  # accepted by a reviewer
    REJECTED = "rejected"  # rejected by a reviewer
31
+
32
+
33
class PaperSource(str, Enum):
    """Paper source identifier used as the ID prefix for discovered papers."""
    PUBMED = "pubmed"
    ARXIV = "arxiv"
    SEMANTIC_SCHOLAR = "s2"  # Semantic Scholar
    MANUAL = "manual"        # manually added papers
    UNKNOWN = "unknown"
40
+
41
+
42
class PaperMetadata(BaseModel):
    """Paper metadata from discovery.

    ``pdf_path`` and ``full_text`` start empty and are filled in later by
    the retrieval/extraction stages.
    """
    id: str = Field(..., description="Unique ID, format: {source}_{original_id}")
    title: str
    authors: List[str] = Field(default_factory=list)
    year: Optional[int] = None
    doi: Optional[str] = None
    abstract: Optional[str] = None
    venue: Optional[str] = None
    citation_count: Optional[int] = None
    is_open_access: Optional[bool] = None
    source: PaperSource = PaperSource.UNKNOWN
    url: Optional[str] = None
    landing_url: Optional[str] = None
    pdf_url: Optional[str] = None
    pdf_path: Optional[str] = None  # local filesystem path once downloaded
    full_text: Optional[str] = None
    match_reasons: List[str] = Field(default_factory=list)
    background_status: Optional[str] = None
    retrieved_at: datetime = Field(default_factory=datetime.now)

    @field_validator('id')
    @classmethod
    def validate_id_format(cls, v: str) -> str:
        """Ensure the ID carries one of the known source prefixes.

        NOTE(review): there is no prefix for PaperSource.UNKNOWN, so IDs for
        unknown-source papers must still use one of these prefixes — confirm
        that this is intentional.
        """
        valid_prefixes = ['pubmed_', 'arxiv_', 's2_', 'manual_']
        if not any(v.startswith(p) for p in valid_prefixes):
            raise ValueError(f"ID must start with one of {valid_prefixes}")
        return v
71
+
72
+
73
class LiteratureQuerySpec(BaseModel):
    """Normalized query payload used by the production literature UI."""
    mode: QueryMode       # which search entrypoint drives the query
    user_query: str       # raw text entered by the user
    polymer_name: Optional[str] = None
    canonical_smiles: Optional[str] = None
    property_key: Optional[str] = None
    project_id: Optional[str] = None
    top_k_extract: int = Field(default=10, ge=1, le=50)    # papers sent to extraction
    result_limit: int = Field(default=15, ge=1, le=100)    # papers shown to the user
83
+
84
+
85
class PaperCardResult(BaseModel):
    """User-facing paper card summary shown in search results."""
    paper_id: str
    title: str
    year: Optional[int] = None
    venue: Optional[str] = None
    doi: Optional[str] = None
    landing_url: Optional[str] = None
    pdf_url: Optional[str] = None
    is_open_access: bool = False
    match_reasons: List[str] = Field(default_factory=list)  # why this paper matched
    background_status: str = "discovered"                   # pipeline stage label
97
+
98
+
99
class LiteratureSupportSummary(BaseModel):
    """Aggregated evidence coverage for a material/property view."""
    matched_paper_count: int = 0
    oa_paper_count: int = 0              # papers flagged open access
    evidence_record_count: int = 0
    approved_record_count: int = 0       # records with review_status == APPROVED
    has_experimental_evidence: bool = False
    literature_support_score: int = Field(default=0, ge=0, le=100)  # 0-100 composite
107
+
108
+
109
class LiteratureEvidenceRecord(BaseModel):
    """Production staging record for extracted literature evidence."""
    # Identity / linkage
    id: Optional[str] = None
    project_id: Optional[str] = None
    paper_id: str
    # Material and property
    material_name: str
    canonical_smiles: Optional[str] = None
    property_key: str
    # Raw value as reported and its standardized form
    raw_value: str
    raw_unit: str
    standardized_value: Optional[float] = None
    standardized_unit: Optional[str] = None
    # Experimental context
    conditions_json: Dict[str, Any] = Field(default_factory=dict)
    method: Optional[str] = None
    # Source traceability
    evidence_quote: str
    evidence_location: Optional[str] = None
    # Extraction provenance and quality
    extractor_version: str
    extraction_model: Optional[str] = None
    extraction_confidence: float = Field(default=0.5, ge=0.0, le=1.0)
    quality_tier: DataQuality = DataQuality.BRONZE
    # Human review workflow
    review_status: ReviewStatus = ReviewStatus.PENDING
    reviewer_note: Optional[str] = None
    edited_payload_json: Optional[Dict[str, Any]] = None
    created_at: Optional[str] = None
    updated_at: Optional[str] = None

    @field_validator("evidence_quote")
    @classmethod
    def validate_evidence_quote(cls, v: str) -> str:
        """Require a trimmed quote of at least 10 characters."""
        text = str(v or "").strip()
        if len(text) < 10:
            raise ValueError("evidence_quote must be at least 10 characters")
        return text
142
+
143
+
144
+ # ============== Experimental Conditions ==============
145
+
146
class ExperimentalConditions(BaseModel):
    """
    Experimental conditions with full context.

    NOTE: extra="allow" keeps LLM-returned fields like humidity, substrate,
    etc. that are not modeled explicitly below.
    """
    model_config = ConfigDict(extra="allow")

    # Preparation conditions
    solvent: Optional[str] = None
    concentration_mg_ml: Optional[float] = None
    spin_speed_rpm: Optional[int] = None
    spin_time_s: Optional[int] = None
    annealing_temp_c: Optional[float] = None
    annealing_time_min: Optional[float] = None
    annealing_atmosphere: Optional[str] = None
    film_thickness_nm: Optional[float] = None

    # Measurement conditions
    measurement_temp_k: Optional[float] = Field(None, description="Measurement temperature (K)")
    measurement_method: Optional[str] = None
    measurement_direction: Optional[str] = None  # in-plane, cross-plane

    def to_dict(self) -> dict:
        """Convert to dict, excluding None values."""
        return {k: v for k, v in self.model_dump().items() if v is not None}
172
+
173
+
174
+ # ============== Contextualized Value ==============
175
+
176
class ContextualizedValue(BaseModel):
    """
    Measurement value with full experimental context and source traceability.

    Design principles:
    - Same paper may report multiple values under different conditions
    - Each value MUST have its associated experimental conditions
    - MANDATORY: source_quote for traceability
    """
    model_config = ConfigDict(extra="allow")

    # Material
    polymer_name: str = Field(..., description="Polymer name e.g. PEDOT:PSS")
    dopant: Optional[str] = None
    dopant_ratio: Optional[str] = None

    # Property measured
    property_name: str = Field(..., description="Property name e.g. electrical_conductivity")

    # Raw value
    raw_value: str = Field(..., description="Raw value string from paper")
    raw_unit: str = Field(..., description="Original unit from paper")

    # Standardized value (filled by Standardizer)
    standardized_value: Optional[float] = None
    standardized_unit: Optional[str] = None
    standardization_error: Optional[str] = None

    # Experimental conditions
    conditions: ExperimentalConditions = Field(default_factory=ExperimentalConditions)

    # Source traceability (MANDATORY!)
    source_quote: str = Field(..., description="Exact quote from paper containing this value")
    source_location: Optional[str] = Field(None, description="Table 1, Figure 3a, etc.")

    # Quality
    extraction_confidence: float = Field(default=0.5, ge=0.0, le=1.0)
    quality_tier: DataQuality = DataQuality.BRONZE

    @field_validator('source_quote')
    @classmethod
    def quote_not_empty(cls, v: str) -> str:
        """Require a trimmed quote of at least 10 characters.

        The threshold and message mirror
        LiteratureEvidenceRecord.validate_evidence_quote.
        """
        if not v or len(v.strip()) < 10:
            raise ValueError("source_quote must be at least 10 characters")
        return v.strip()

    def to_db_dict(self) -> dict:
        """Convert to database storage format."""
        return {
            "polymer_name": self.polymer_name,
            "dopant": self.dopant,
            "dopant_ratio": self.dopant_ratio,
            "property_name": self.property_name,
            "raw_value": self.raw_value,
            "raw_unit": self.raw_unit,
            "standardized_value": self.standardized_value,
            "standardized_unit": self.standardized_unit,
            "conditions": self.conditions.to_dict(),
            "source_quote": self.source_quote,
            "source_location": self.source_location,
            "extraction_confidence": self.extraction_confidence,
            "quality_tier": self.quality_tier.value,
        }
239
+
240
+
241
+ # ============== Legacy PolymerDataPoint (for compatibility) ==============
242
+
243
class PolymerDataPoint(BaseModel):
    """Single data point extracted from literature (legacy flat format).

    Newer code uses ContextualizedValue; this model is kept for backward
    compatibility with existing extraction output.
    """
    # Material Information
    polymer_name: str = Field(..., description="Polymer name, e.g. P3HT, PEDOT:PSS")
    polymer_class: Optional[str] = Field(None, description="Polymer class")
    dopant: Optional[str] = None
    dopant_ratio: Optional[str] = None

    # Processing Conditions
    solvent: Optional[str] = None
    concentration_mg_ml: Optional[float] = None
    spin_speed_rpm: Optional[int] = None
    spin_time_s: Optional[int] = None
    annealing_temp_c: Optional[float] = None
    annealing_time_min: Optional[float] = None
    annealing_atmosphere: Optional[str] = None
    film_thickness_nm: Optional[float] = None

    # Electrical Properties
    electrical_conductivity_s_cm: Optional[float] = None
    seebeck_coefficient_uv_k: Optional[float] = None
    power_factor_uw_m_k2: Optional[float] = None

    # Thermal Properties
    thermal_conductivity_w_mk: Optional[float] = None
    zt_figure_of_merit: Optional[float] = None

    # Structural
    xrd_crystallinity_percent: Optional[float] = None
    xrd_pi_stacking_angstrom: Optional[float] = None
    xrd_lamellar_spacing_angstrom: Optional[float] = None

    # Metadata
    source_paper_id: str
    source_table_or_figure: Optional[str] = None
    extraction_confidence: float = Field(default=0.5, ge=0.0, le=1.0)
    quality_tier: DataQuality = DataQuality.BRONZE
    raw_text_snippet: Optional[str] = None

    @field_validator('electrical_conductivity_s_cm', 'thermal_conductivity_w_mk', mode='before')
    @classmethod
    def validate_positive(cls, v: Any) -> Optional[float]:
        """Silently drop (set to None) physically impossible negative values."""
        if v is not None and isinstance(v, (int, float)) and v < 0:
            return None
        return v
288
+
289
+
290
+ # ============== Extraction Result ==============
291
+
292
class ExtractionResult(BaseModel):
    """
    Extraction result for a single paper.

    Supports both old format (paper=PaperMetadata) and new format
    (paper_id, paper_title); the model_validator backfills the new
    fields from the old one.
    """
    model_config = ConfigDict(extra="allow")

    # New format fields (preferred)
    paper_id: Optional[str] = None
    paper_title: Optional[str] = None

    # Old format field (for backward compatibility)
    paper: Optional[PaperMetadata] = None

    # Common fields
    data_points: List = Field(default_factory=list)  # Can be ContextualizedValue or PolymerDataPoint
    extraction_model: str = "unknown"
    extraction_timestamp: Any = Field(default_factory=lambda: datetime.now().isoformat())
    success: bool = True
    error_message: Optional[str] = None

    # Legacy fields
    llm_model_used: Optional[str] = None
    extraction_notes: Optional[str] = None

    @model_validator(mode='after')
    def extract_paper_fields(self):
        """Extract paper_id and paper_title from paper if not provided."""
        if self.paper is not None:
            if self.paper_id is None:
                self.paper_id = self.paper.id
            if self.paper_title is None:
                self.paper_title = self.paper.title
        # Copy llm_model_used to extraction_model if present
        if self.llm_model_used and self.extraction_model == "unknown":
            self.extraction_model = self.llm_model_used
        return self
literature/standardizer.py ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Unit standardization for production literature evidence.
3
+
4
+ The standard units are aligned with the platform property catalog so extracted
5
+ evidence can be compared and filtered consistently before human review.
6
+ """
7
+ import logging
8
+ import re
9
+ from dataclasses import dataclass
10
+ from typing import Callable, Dict, List, Optional
11
+
12
+ from .property_registry import PROPERTY_CATALOG
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
@dataclass
class StandardizationResult:
    """Outcome of a single value/unit standardization attempt."""
    success: bool                  # True when value and unit are populated
    value: Optional[float] = None  # converted numeric value
    unit: Optional[str] = None     # platform-standard unit string
    error: Optional[str] = None    # human-readable failure reason
24
+
25
+
26
def normalize_minus_signs(s: str) -> str:
    """Normalize all Unicode minus/dash look-alikes to ASCII hyphen-minus."""
    dash_like = "−–—‐‑‒⁻₋➖"
    return s.translate(str.maketrans(dict.fromkeys(dash_like, "-")))
34
+
35
+
36
def _identity(value: float) -> float:
    """Return *value* unchanged (no-op unit conversion)."""
    return value
38
+
39
+
40
+ def _mul(factor: float) -> Callable[[float], float]:
41
+ return lambda value: value * factor
42
+
43
+
44
+ def _add(delta: float) -> Callable[[float], float]:
45
+ return lambda value: value + delta
46
+
47
+
48
class UnitStandardizer:
    """Convert raw values from papers to platform-standard units.

    Standard units come from the shared PROPERTY_CATALOG so extracted
    evidence is comparable and filterable before human review.
    """

    # property key -> platform-standard unit string
    STANDARD_UNITS = {key: meta["unit"] for key, meta in PROPERTY_CATALOG.items()}

    # normalized lowercase spellings seen in papers -> canonical unit string
    UNIT_ALIASES = {
        # Temperature
        "k": "K",
        "kelvin": "K",
        "c": "C",
        "°c": "C",
        "deg c": "C",
        "celsius": "C",
        # Thermal
        "w/mk": "W/(m*K)",
        "w/(m·k)": "W/(m*K)",
        "w m-1 k-1": "W/(m*K)",
        "w·m⁻¹·k⁻¹": "W/(m*K)",
        "mw/(m*k)": "mW/(m*K)",
        "mw/(m·k)": "mW/(m*K)",
        "j/kgk": "J/(kg*K)",
        "j/(kg·k)": "J/(kg*K)",
        "j/(kg*k)": "J/(kg*K)",
        "j/gk": "J/(g*K)",
        "j/(g*k)": "J/(g*K)",
        # Mechanical
        "gpa": "GPa",
        "mpa": "MPa",
        # Transport / physical
        "pa s": "Pa*s",
        "pa·s": "Pa*s",
        "pas": "Pa*s",
        "mpa*s": "mPa*s",
        "cm2/s": "cm^2/s",
        "cm^2/s": "cm^2/s",
        "mm2/s": "mm^2/s",
        "mm^2/s": "mm^2/s",
        "g/cm3": "g/cm^3",
        "g/cm^3": "g/cm^3",
        "kg/m3": "kg/m^3",
        "kg/m^3": "kg/m^3",
        "ang": "Angstrom",
        "angstrom": "Angstrom",
        "å": "Angstrom",
        "nm": "nm",
        # Electronics
        "ev": "eV",
        "a.u.": "a.u.",
        "au": "a.u.",
        "debye": "Debye",
        # Gas / transport
        "barrer": "Barrer",
        # Extended literature properties
        "s/cm": "S/cm",
        "s m-1": "S/m",
        "s/m": "S/m",
        "uv/k": "uV/K",
        "μv/k": "uV/K",
        "µv/k": "uV/K",
        "mv/k": "mV/K",
        "uw/(m*k^2)": "uW/(m*K^2)",
        "uw/(m*k**2)": "uW/(m*K^2)",
        "uw/(m·k²)": "uW/(m*K^2)",
        "mw/(m*k^2)": "mW/(m*K^2)",
        "%": "%",
        "dimensionless": "",
        "-": "",
        "": "",
    }

    # property key -> {(reported_unit, standard_unit): transform}
    CONVERSIONS: Dict[str, Dict[tuple[str, str], Callable[[float], float]]] = {
        "tm": {("C", "K"): _add(273.15)},
        "tg": {("C", "K"): _add(273.15)},
        "cp": {("J/(g*K)", "J/(kg*K)"): _mul(1000.0)},
        "tc": {("mW/(m*K)", "W/(m*K)"): _mul(0.001)},
        "young": {("MPa", "GPa"): _mul(0.001)},
        "shear": {("MPa", "GPa"): _mul(0.001)},
        "bulk": {("MPa", "GPa"): _mul(0.001)},
        "visc": {("mPa*s", "Pa*s"): _mul(0.001)},
        "dif": {("mm^2/s", "cm^2/s"): _mul(0.01)},
        "rho": {("kg/m^3", "g/cm^3"): _mul(0.001)},
        "rg": {("nm", "Angstrom"): _mul(10.0)},
        "electrical_conductivity": {("S/m", "S/cm"): _mul(0.01)},
        "seebeck_coefficient": {("mV/K", "uV/K"): _mul(1000.0)},
        "power_factor": {("mW/(m*K^2)", "uW/(m*K^2)"): _mul(1000.0)},
    }

    def standardize(
        self,
        property_name: str,
        raw_value: str,
        raw_unit: str,
    ) -> StandardizationResult:
        """
        Convert (raw_value, raw_unit) to the standard unit for property_name.

        Returns:
            StandardizationResult with success=True and the converted value,
            or success=False and a human-readable error.
        """
        try:
            numeric = self._parse_numeric(raw_value)
        except ValueError as exc:
            return StandardizationResult(success=False, error=f"Parse error: {exc}")

        standard_unit = self.STANDARD_UNITS.get(property_name)
        if standard_unit is None:
            return StandardizationResult(success=False, error=f"Unknown property: {property_name}")

        normalized = self._normalize_unit(raw_unit)
        # Dimensionless properties accept any (or no) reported unit.
        if standard_unit in {"dimensionless", ""}:
            return StandardizationResult(success=True, value=numeric, unit="")

        if normalized == standard_unit:
            return StandardizationResult(success=True, value=numeric, unit=standard_unit)

        transform = self.CONVERSIONS.get(property_name, {}).get((normalized, standard_unit))
        if transform is not None:
            return StandardizationResult(success=True, value=transform(numeric), unit=standard_unit)

        if normalized == "":
            return StandardizationResult(success=False, error=f"Missing unit for {property_name}")

        return StandardizationResult(
            success=False,
            error=f"Cannot convert {normalized} to {standard_unit} for {property_name}",
        )

    def _parse_numeric(self, value_str: str) -> float:
        """
        Parse a raw value string into a float.

        Handles Unicode minus signs, superscript exponents ("3×10⁻²"),
        "×10^n" scientific notation, ranges ("1.2-3.4" -> midpoint), and
        "a ± b" uncertainty (returns a).

        Raises:
            ValueError: if the string cannot be interpreted as a number.
        """
        s = normalize_minus_signs(str(value_str or "").strip())

        # Map superscript digits to ASCII *before* rewriting "×10..." so
        # exponents typeset as superscripts (e.g. "3×10⁻²") become "3×10-2"
        # and match the scientific-notation regex below.  (Applying this
        # after the regex, as before, left "3×10-²" unconvertible.)
        superscripts = {
            "⁰": "0", "¹": "1", "²": "2", "³": "3", "⁴": "4",
            "⁵": "5", "⁶": "6", "⁷": "7", "⁸": "8", "⁹": "9", "⁻": "-",
        }
        for sup, norm in superscripts.items():
            s = s.replace(sup, norm)

        # "3 × 10^-2" / "3x10-2" -> "3e-2"
        s = re.sub(r"\s*[×x]\s*10\^?\s*(-?\d+)", r"e\1", s)
        s = s.replace(" ", "")

        # "1.2-3.4" -> midpoint of the reported range
        range_match = re.match(r"^(\d+(?:\.\d+)?)\s*-\s*(\d+(?:\.\d+)?)$", s)
        if range_match:
            low = float(range_match.group(1))
            high = float(range_match.group(2))
            return (low + high) / 2

        # "a ± b" -> central value a
        pm_match = re.match(r"^([\d.eE+-]+)\s*[±]\s*[\d.eE+-]+$", s)
        if pm_match:
            return float(pm_match.group(1))

        return float(s)

    def _normalize_unit(self, unit: str) -> str:
        """Canonicalize a raw unit string via the alias table.

        Unknown units are returned after minimal cleanup (minus signs,
        superscripts, middle dots) so error messages stay readable.
        """
        normalized = normalize_minus_signs(str(unit or "").strip())
        normalized = normalized.replace("²", "^2").replace("³", "^3")
        # "·" is the typeset multiplication dot; "\u00a0" is a non-breaking
        # space that sometimes survives PDF extraction.
        normalized = normalized.replace("·", "*").replace("\u00a0", " ")
        key = re.sub(r"\s+", " ", normalized.lower()).strip()
        return self.UNIT_ALIASES.get(key, normalized)

    def standardize_data_points(self, data_points: List) -> List:
        """Standardize each data point in place, recording errors on failure."""
        for dp in data_points:
            result = self.standardize(
                property_name=dp.property_name,
                raw_value=dp.raw_value,
                raw_unit=dp.raw_unit,
            )
            if result.success:
                dp.standardized_value = result.value
                dp.standardized_unit = result.unit
            else:
                dp.standardization_error = result.error
        return data_points
scripts/__pycache__/run_literature_mining.cpython-313.pyc ADDED
Binary file (7.79 kB). View file
 
scripts/evaluate_polyie.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import json
6
+ from pathlib import Path
7
+
8
+ from literature.evaluation import evaluate_predictions, load_json_records
9
+
10
+
11
def main() -> None:
    """CLI entry point: score predictions against gold and emit metrics as JSON."""
    arg_parser = argparse.ArgumentParser(description="Evaluate extraction output against a POLYIE-style gold file.")
    arg_parser.add_argument("--gold", required=True, help="Gold file (.json or .jsonl)")
    arg_parser.add_argument("--pred", required=True, help="Prediction file (.json or .jsonl)")
    arg_parser.add_argument("--out", default=None, help="Optional JSON output path")
    options = arg_parser.parse_args()

    # Compute metrics over the two record sets and render once.
    metrics = evaluate_predictions(
        load_json_records(options.gold),
        load_json_records(options.pred),
    )
    rendered = json.dumps(metrics, indent=2, ensure_ascii=False)
    print(rendered)

    # Optionally persist the same rendering (plus trailing newline) to disk.
    if options.out:
        Path(options.out).write_text(rendered + "\n", encoding="utf-8")


if __name__ == "__main__":
    main()
scripts/run_literature_mining.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Project-based literature mining CLI.
4
+
5
+ Examples:
6
+ python scripts/run_literature_mining.py --query "PEDOT:PSS thermoelectric" --limit 5
7
+ python scripts/run_literature_mining.py --project-id proj_xxx --query "P3HT conductivity" --save-mode files
8
+ """
9
+ from __future__ import annotations
10
+
11
+ import argparse
12
+ import csv
13
+ import json
14
+ from pathlib import Path
15
+ from typing import Any, Dict, List
16
+
17
+ from dotenv import load_dotenv
18
+
19
+ from src.literature_service import (
20
+ DataPointRepo,
21
+ LiteraturePipeline,
22
+ ProjectRepo,
23
+ QueryIntentService,
24
+ QuerySessionRepo,
25
+ get_database,
26
+ )
27
+
28
+ load_dotenv()
29
+
30
+
31
def resolve_project_id(project_id: str | None, projects: ProjectRepo) -> str:
    """Return a usable project ID.

    An explicit ``project_id`` is validated against the repo; otherwise the
    first existing project is reused, and as a last resort a default project
    is created.

    Raises:
        ValueError: if an explicit ``project_id`` does not exist.
    """
    if project_id:
        if not projects.get_project(project_id):
            raise ValueError(f"Project not found: {project_id}")
        return project_id

    available = projects.list_projects()
    if available:
        return available[0]["id"]

    default_project = projects.create_project(
        name="Default Literature Project",
        description="Auto-created by run_literature_mining.py",
    )
    return default_project["id"]
47
+
48
+
49
def export_points_to_files(project_id: str, points: List[Dict[str, Any]], out_dir: Path) -> None:
    """Export validated data points to JSONL and CSV files under ``out_dir``.

    Args:
        project_id: Owning project ID (kept for interface symmetry; not part
            of the written file contents).
        points: Row dicts to export; keys may vary between rows.
        out_dir: Target directory, created if missing.
    """
    out_dir.mkdir(parents=True, exist_ok=True)

    jsonl_path = out_dir / "validated_points.jsonl"
    with jsonl_path.open("w", encoding="utf-8") as f:
        for row in points:
            f.write(json.dumps(row, ensure_ascii=False) + "\n")

    csv_path = out_dir / "validated_points.csv"
    if points:
        # Collect the union of keys across ALL rows (first-seen order).
        # Using only points[0].keys() makes DictWriter raise ValueError on
        # any row that carries a key the first row lacks.
        fieldnames: Dict[str, None] = {}
        for row in points:
            for key in row:
                fieldnames.setdefault(key, None)
        with csv_path.open("w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=list(fieldnames), restval="")
            writer.writeheader()
            writer.writerows(points)
    else:
        # Still emit a header-only CSV so downstream readers find the file.
        csv_path.write_text("point_id,project_id\n", encoding="utf-8")

    print(f"Exported {len(points)} rows to:")
    print(f" - {jsonl_path}")
    print(f" - {csv_path}")
69
+
70
+
71
def main() -> None:
    """CLI entry point: run the project-based literature-mining pipeline.

    Flow: parse args -> open sqlite-backed repos -> resolve/create the target
    project -> record query intent -> run discovery/retrieval/extraction
    (either persisted via ``run_full_pipeline`` or ad-hoc with ``--no-save``)
    -> optionally export the project's validated points to files.
    """
    parser = argparse.ArgumentParser(description="Project-based Literature Mining CLI")
    parser.add_argument("--project-id", default=None, help="Target project ID")
    parser.add_argument("--query", default="PEDOT:PSS thermoelectric conductivity", help="Search query")
    parser.add_argument("--limit", type=int, default=5, help="Max papers per source")
    parser.add_argument("--strategy", choices=["simple", "paperqa"], default="simple", help="Extraction strategy")
    parser.add_argument("--model-provider", default="openai_compatible", help="Model provider name")
    parser.add_argument("--model-name", default="gpt-oss:latest", help="Model name")
    parser.add_argument("--save-mode", choices=["sqlite", "files"], default="sqlite", help="Result sink mode")
    parser.add_argument("--no-save", action="store_true", help="Do not persist result to sqlite")
    # Parsed but not consumed below — reserved for a future batch-upload flow.
    parser.add_argument("--manual-upload-dir", default="data/literature/manual_uploads", help="Reserved for batch manual upload")
    args = parser.parse_args()

    # All repos share one sqlite database; the pipeline opens the same path
    # independently via its own db_path argument.
    db = get_database("data/app.db")
    project_repo = ProjectRepo(db)
    point_repo = DataPointRepo(db)
    query_repo = QuerySessionRepo(db)
    query_intent = QueryIntentService(query_repo)
    pipeline = LiteraturePipeline(db_path="data/app.db")

    # Falls back to the first existing project, or creates a default one.
    target_project_id = resolve_project_id(args.project_id, project_repo)
    project = project_repo.get_project(target_project_id)
    print("=" * 64)
    print("Project-Based Literature Mining")
    print(f"Project: {project['name']} ({target_project_id})")
    print(f"Query: {args.query}")
    print(f"Limit per source: {args.limit}")
    print(f"Strategy: {args.strategy}")
    print("=" * 64)

    # Store the query session and surface any suggested refinements.
    query_session = query_intent.analyze_and_store(target_project_id, args.query)
    suggestions = json.loads(query_session.get("suggestions_json") or "[]")
    if suggestions:
        print("Query suggestions:")
        for s in suggestions:
            print(f" - {s}")
    # Clarification is advisory only in CLI mode — the run proceeds anyway.
    if query_session.get("clarification_required"):
        print("Note: query marked as pending_clarification. Continuing by CLI override.")

    if args.no_save:
        # Ad-hoc path: run the three stages directly without creating a DB
        # run record. NOTE(review): this branch does not forward
        # args.model_provider to run_extraction — confirm that is intended.
        discovered = pipeline.run_discovery(target_project_id, args.query, args.limit)
        retrieved = pipeline.run_retrieval(target_project_id, discovered)
        stats = pipeline.run_extraction(
            target_project_id,
            run_id=None,
            paper_rows=retrieved,
            strategy=args.strategy,
            model_name=args.model_name,
            use_full_text=True,
        )
        print(f"Extraction complete without DB run record: {stats}")
    else:
        # Persisted path: the pipeline records the run and returns a status
        # dict with "status", "error", and "stats" keys.
        result = pipeline.run_full_pipeline(
            project_id=target_project_id,
            query=args.query,
            limit=args.limit,
            strategy=args.strategy,
            model_provider=args.model_provider,
            model_name=args.model_name,
            use_full_text=True,
        )
        print(f"Pipeline status: {result.get('status')}")
        if result.get("status") != "completed":
            print(f"Error: {result.get('error')}")
        else:
            print(json.dumps(result.get("stats", {}), indent=2))

    # Export everything currently stored for the project (not just this
    # run's points) when file output was requested.
    points = point_repo.list_points(target_project_id)
    if args.save_mode == "files":
        run_dir = Path("data/literature/runs")
        export_points_to_files(target_project_id, points, run_dir)

    print("=" * 64)
    print("Done.")
    print("=" * 64)


if __name__ == "__main__":
    main()
scripts/train_prior_slurm.sh ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
#SBATCH --job-name=polymer_prior
#SBATCH --nodes=1
#SBATCH --gres=gpu:4
#SBATCH --cpus-per-task=16
#SBATCH --mem=64G
#SBATCH --time=24:00:00
#SBATCH --output=logs/train_prior_%j.out
#SBATCH --error=logs/train_prior_%j.err

# Distributed training of the RNN prior on a single 4-GPU SLURM node.
# Fail fast on errors, unset variables, and pipeline failures.
set -euo pipefail

# Adjust these for your CRC environment. Both can now be overridden from the
# submitting environment, e.g.:
#   REPO_DIR=/path/to/checkout sbatch scripts/train_prior_slurm.sh
# The defaults below point at a local development checkout.
REPO_DIR="${REPO_DIR:-/Users/xuguoyue/Documents/GitHub/POLYMER-PROPERTY}"
VENV_DIR="${VENV_DIR:-$REPO_DIR/.venv}"

cd "$REPO_DIR"

# Load modules if your CRC requires it (example)
# module load python/3.10

source "$VENV_DIR/bin/activate"

# Log directory must exist before SLURM/torchrun write into it.
mkdir -p logs

# Bound intra-op thread pools so 4 ranks do not oversubscribe the 16 CPUs.
export OMP_NUM_THREADS=8
export MKL_NUM_THREADS=8

# One training process per GPU on this node.
torchrun --nproc_per_node=4 RNN/train_prior.py \
  --smiles-csv data/PI1M.csv \
  --vocab RNN/pretrained_model/voc \
  --output RNN/pretrained_model/Prior.ckpt \
  --epochs 10 \
  --batch-size 256 \
  --lr 1e-3 \
  --max-length 140 \
  --num-workers 4 \
  --log-every 200
src/.DS_Store ADDED
Binary file (10.2 kB). View file
 
src/__pycache__/conv.cpython-310.pyc ADDED
Binary file (7.21 kB). View file
 
src/__pycache__/conv.cpython-313.pyc ADDED
Binary file (11.2 kB). View file
 
src/__pycache__/data_builder.cpython-310.pyc ADDED
Binary file (24 kB). View file
 
src/__pycache__/data_builder.cpython-313.pyc ADDED
Binary file (40.6 kB). View file
 
src/__pycache__/discover_llm.cpython-310.pyc ADDED
Binary file (23 kB). View file
 
src/__pycache__/discover_llm.cpython-313.pyc ADDED
Binary file (37.7 kB). View file
 
src/__pycache__/discovery.cpython-310.pyc ADDED
Binary file (21 kB). View file
 
src/__pycache__/discovery.cpython-313.pyc ADDED
Binary file (34.6 kB). View file