Upload 119 files
Browse files. This view is limited to 50 files because it contains too many changes. See raw diff
- literature/__init__.py +100 -0
- literature/__pycache__/__init__.cpython-310.pyc +0 -0
- literature/__pycache__/__init__.cpython-313.pyc +0 -0
- literature/__pycache__/clarifier.cpython-310.pyc +0 -0
- literature/__pycache__/clarifier.cpython-313.pyc +0 -0
- literature/__pycache__/config.cpython-310.pyc +0 -0
- literature/__pycache__/config.cpython-313.pyc +0 -0
- literature/__pycache__/converters.cpython-310.pyc +0 -0
- literature/__pycache__/converters.cpython-313.pyc +0 -0
- literature/__pycache__/discovery.cpython-310.pyc +0 -0
- literature/__pycache__/discovery.cpython-313.pyc +0 -0
- literature/__pycache__/evaluation.cpython-310.pyc +0 -0
- literature/__pycache__/extraction.cpython-310.pyc +0 -0
- literature/__pycache__/extraction.cpython-313.pyc +0 -0
- literature/__pycache__/graph.cpython-313.pyc +0 -0
- literature/__pycache__/property_registry.cpython-310.pyc +0 -0
- literature/__pycache__/property_registry.cpython-313.pyc +0 -0
- literature/__pycache__/quality.cpython-310.pyc +0 -0
- literature/__pycache__/quality.cpython-313.pyc +0 -0
- literature/__pycache__/retrieval.cpython-310.pyc +0 -0
- literature/__pycache__/retrieval.cpython-313.pyc +0 -0
- literature/__pycache__/schemas.cpython-310.pyc +0 -0
- literature/__pycache__/schemas.cpython-313.pyc +0 -0
- literature/__pycache__/standardizer.cpython-310.pyc +0 -0
- literature/__pycache__/standardizer.cpython-313.pyc +0 -0
- literature/clarifier.py +89 -0
- literature/config.py +71 -0
- literature/converters.py +56 -0
- literature/discovery.py +380 -0
- literature/evaluation.py +155 -0
- literature/extraction.py +863 -0
- literature/graph.py +450 -0
- literature/property_registry.py +274 -0
- literature/quality.py +176 -0
- literature/retrieval.py +398 -0
- literature/schemas.py +329 -0
- literature/standardizer.py +211 -0
- scripts/__pycache__/run_literature_mining.cpython-313.pyc +0 -0
- scripts/evaluate_polyie.py +29 -0
- scripts/run_literature_mining.py +149 -0
- scripts/train_prior_slurm.sh +38 -0
- src/.DS_Store +0 -0
- src/__pycache__/conv.cpython-310.pyc +0 -0
- src/__pycache__/conv.cpython-313.pyc +0 -0
- src/__pycache__/data_builder.cpython-310.pyc +0 -0
- src/__pycache__/data_builder.cpython-313.pyc +0 -0
- src/__pycache__/discover_llm.cpython-310.pyc +0 -0
- src/__pycache__/discover_llm.cpython-313.pyc +0 -0
- src/__pycache__/discovery.cpython-310.pyc +0 -0
- src/__pycache__/discovery.cpython-313.pyc +0 -0
literature/__init__.py
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Literature mining package for project-based extraction workflows."""
|
| 2 |
+
|
| 3 |
+
from .schemas import (
|
| 4 |
+
ContextualizedValue,
|
| 5 |
+
DataQuality,
|
| 6 |
+
ExperimentalConditions,
|
| 7 |
+
ExtractionResult,
|
| 8 |
+
LiteratureEvidenceRecord,
|
| 9 |
+
LiteratureQuerySpec,
|
| 10 |
+
LiteratureSupportSummary,
|
| 11 |
+
PaperMetadata,
|
| 12 |
+
PaperCardResult,
|
| 13 |
+
PaperSource,
|
| 14 |
+
PolymerDataPoint,
|
| 15 |
+
QueryMode,
|
| 16 |
+
ReviewStatus,
|
| 17 |
+
)
|
| 18 |
+
from .property_registry import (
|
| 19 |
+
PROPERTY_CATALOG,
|
| 20 |
+
PLATFORM_PROPERTY_KEYS,
|
| 21 |
+
TEMPLATES,
|
| 22 |
+
TEMPLATE_LABELS,
|
| 23 |
+
build_extraction_prompt,
|
| 24 |
+
detect_property_keys,
|
| 25 |
+
normalize_property_key,
|
| 26 |
+
property_display_name,
|
| 27 |
+
)
|
| 28 |
+
from .quality import QualityAssessor, QualityReport
|
| 29 |
+
from .standardizer import StandardizationResult, UnitStandardizer, normalize_minus_signs
|
| 30 |
+
from .clarifier import ClarifierAgent, QueryAnalysis
|
| 31 |
+
from .evaluation import evaluate_predictions, load_json_records
|
| 32 |
+
|
| 33 |
+
try:
|
| 34 |
+
from .config import LiteratureConfig, get_config
|
| 35 |
+
except Exception: # pragma: no cover - optional runtime dependency
|
| 36 |
+
LiteratureConfig = None # type: ignore
|
| 37 |
+
get_config = None # type: ignore
|
| 38 |
+
|
| 39 |
+
try:
|
| 40 |
+
from .discovery import PaperDiscoveryAgent
|
| 41 |
+
except Exception: # pragma: no cover - optional runtime dependency
|
| 42 |
+
PaperDiscoveryAgent = None # type: ignore
|
| 43 |
+
|
| 44 |
+
try:
|
| 45 |
+
from .retrieval import PDFRetriever, extract_text_from_pdf
|
| 46 |
+
except Exception: # pragma: no cover - optional runtime dependency
|
| 47 |
+
PDFRetriever = None # type: ignore
|
| 48 |
+
extract_text_from_pdf = None # type: ignore
|
| 49 |
+
|
| 50 |
+
try:
|
| 51 |
+
from .extraction import ContextualizedExtractor, DataExtractor
|
| 52 |
+
except Exception: # pragma: no cover - optional runtime dependency
|
| 53 |
+
ContextualizedExtractor = None # type: ignore
|
| 54 |
+
DataExtractor = None # type: ignore
|
| 55 |
+
|
| 56 |
+
try:
|
| 57 |
+
from .converters import to_experiment_result
|
| 58 |
+
except Exception: # pragma: no cover - optional runtime dependency
|
| 59 |
+
to_experiment_result = None # type: ignore
|
| 60 |
+
|
| 61 |
+
__all__ = [
|
| 62 |
+
"LiteratureConfig",
|
| 63 |
+
"get_config",
|
| 64 |
+
"PaperMetadata",
|
| 65 |
+
"PaperSource",
|
| 66 |
+
"PolymerDataPoint",
|
| 67 |
+
"ExtractionResult",
|
| 68 |
+
"DataQuality",
|
| 69 |
+
"ContextualizedValue",
|
| 70 |
+
"ExperimentalConditions",
|
| 71 |
+
"LiteratureQuerySpec",
|
| 72 |
+
"PaperCardResult",
|
| 73 |
+
"LiteratureEvidenceRecord",
|
| 74 |
+
"LiteratureSupportSummary",
|
| 75 |
+
"QueryMode",
|
| 76 |
+
"ReviewStatus",
|
| 77 |
+
"PaperDiscoveryAgent",
|
| 78 |
+
"PDFRetriever",
|
| 79 |
+
"extract_text_from_pdf",
|
| 80 |
+
"DataExtractor",
|
| 81 |
+
"ContextualizedExtractor",
|
| 82 |
+
"QualityAssessor",
|
| 83 |
+
"QualityReport",
|
| 84 |
+
"UnitStandardizer",
|
| 85 |
+
"normalize_minus_signs",
|
| 86 |
+
"StandardizationResult",
|
| 87 |
+
"ClarifierAgent",
|
| 88 |
+
"QueryAnalysis",
|
| 89 |
+
"evaluate_predictions",
|
| 90 |
+
"load_json_records",
|
| 91 |
+
"to_experiment_result",
|
| 92 |
+
"PROPERTY_CATALOG",
|
| 93 |
+
"PLATFORM_PROPERTY_KEYS",
|
| 94 |
+
"TEMPLATES",
|
| 95 |
+
"TEMPLATE_LABELS",
|
| 96 |
+
"build_extraction_prompt",
|
| 97 |
+
"detect_property_keys",
|
| 98 |
+
"normalize_property_key",
|
| 99 |
+
"property_display_name",
|
| 100 |
+
]
|
literature/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (2 kB). View file
|
|
|
literature/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (1.42 kB). View file
|
|
|
literature/__pycache__/clarifier.cpython-310.pyc
ADDED
|
Binary file (2.53 kB). View file
|
|
|
literature/__pycache__/clarifier.cpython-313.pyc
ADDED
|
Binary file (3.13 kB). View file
|
|
|
literature/__pycache__/config.cpython-310.pyc
ADDED
|
Binary file (2.75 kB). View file
|
|
|
literature/__pycache__/config.cpython-313.pyc
ADDED
|
Binary file (3.39 kB). View file
|
|
|
literature/__pycache__/converters.cpython-310.pyc
ADDED
|
Binary file (1.76 kB). View file
|
|
|
literature/__pycache__/converters.cpython-313.pyc
ADDED
|
Binary file (2.57 kB). View file
|
|
|
literature/__pycache__/discovery.cpython-310.pyc
ADDED
|
Binary file (10.8 kB). View file
|
|
|
literature/__pycache__/discovery.cpython-313.pyc
ADDED
|
Binary file (16.4 kB). View file
|
|
|
literature/__pycache__/evaluation.cpython-310.pyc
ADDED
|
Binary file (5.28 kB). View file
|
|
|
literature/__pycache__/extraction.cpython-310.pyc
ADDED
|
Binary file (22.6 kB). View file
|
|
|
literature/__pycache__/extraction.cpython-313.pyc
ADDED
|
Binary file (30.2 kB). View file
|
|
|
literature/__pycache__/graph.cpython-313.pyc
ADDED
|
Binary file (15.6 kB). View file
|
|
|
literature/__pycache__/property_registry.cpython-310.pyc
ADDED
|
Binary file (8.36 kB). View file
|
|
|
literature/__pycache__/property_registry.cpython-313.pyc
ADDED
|
Binary file (6.76 kB). View file
|
|
|
literature/__pycache__/quality.cpython-310.pyc
ADDED
|
Binary file (5.71 kB). View file
|
|
|
literature/__pycache__/quality.cpython-313.pyc
ADDED
|
Binary file (7.55 kB). View file
|
|
|
literature/__pycache__/retrieval.cpython-310.pyc
ADDED
|
Binary file (9.92 kB). View file
|
|
|
literature/__pycache__/retrieval.cpython-313.pyc
ADDED
|
Binary file (16 kB). View file
|
|
|
literature/__pycache__/schemas.cpython-310.pyc
ADDED
|
Binary file (12.2 kB). View file
|
|
|
literature/__pycache__/schemas.cpython-313.pyc
ADDED
|
Binary file (11.6 kB). View file
|
|
|
literature/__pycache__/standardizer.cpython-310.pyc
ADDED
|
Binary file (6.41 kB). View file
|
|
|
literature/__pycache__/standardizer.cpython-313.pyc
ADDED
|
Binary file (8.84 kB). View file
|
|
|
literature/clarifier.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations

from dataclasses import dataclass
from typing import Dict, List

from .property_registry import detect_property_keys, property_display_name


# Material/polymer trigger words. Matched as lowercase substrings of the
# query in ClarifierAgent.analyze, so e.g. "pedot:pss" in a query will also
# match the shorter "pedot" entry.
POLYMER_KEYWORDS = {
    "polymer",
    "polyimide",
    "peek",
    "polyethylene",
    "pedot",
    "pedot:pss",
    "p3ht",
    "smiles",
}

# Processing / measurement condition trigger words, matched the same way
# (lowercase substring containment).
CONDITION_KEYWORDS = {
    "anneal",
    "annealing",
    "solvent",
    "dopant",
    "doping",
    "spin coat",
    "temperature",
    "thickness",
    "pressure",
    "humidity",
    "method",
}
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
@dataclass
class QueryAnalysis:
    """Outcome of a heuristic scan over a user's literature search query.

    Holds what was recognized in the query (materials, properties,
    conditions), any suggestions for enriching it, and whether the flow
    should pause for clarification before searching.
    """

    original_query: str
    detected_polymers: List[str]
    detected_properties: List[str]
    detected_conditions: List[str]
    suggestions: List[str]
    clarification_required: bool
    status: str

    def to_payload(self) -> Dict[str, object]:
        """Serialize the analysis into a plain dict (e.g. for a JSON API response)."""
        return dict(
            original_query=self.original_query,
            detected_polymers=self.detected_polymers,
            detected_properties=self.detected_properties,
            detected_conditions=self.detected_conditions,
            suggestions=self.suggestions,
            clarification_required=self.clarification_required,
            status=self.status,
        )
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
class ClarifierAgent:
    """Heuristic query clarifier for the production search flow.

    Nudges users toward providing material + property + condition context,
    while never blocking a valid free-form task query outright.
    """

    def analyze(self, query: str) -> QueryAnalysis:
        """Scan *query* for known keywords and build enrichment suggestions."""
        lowered = (query or "").lower()
        found_polymers = [kw for kw in POLYMER_KEYWORDS if kw in lowered]
        found_properties = detect_property_keys(query or "")
        found_conditions = [kw for kw in CONDITION_KEYWORDS if kw in lowered]

        advice: List[str] = []
        if not found_polymers:
            advice.append("Add a target polymer or material name.")
        if not found_properties:
            advice.append("Specify a key property focus, e.g. " + property_display_name("tg") + ".")
        if not found_conditions:
            advice.append("Add one processing or measurement condition if available.")

        # Clarification is only forced when neither a material nor a property
        # was recognized; missing conditions alone just yield a suggestion.
        needs_clarification = not (found_polymers or found_properties)

        return QueryAnalysis(
            original_query=query,
            detected_polymers=found_polymers,
            detected_properties=found_properties,
            detected_conditions=found_conditions,
            suggestions=advice,
            clarification_required=needs_clarification,
            status="pending_clarification" if needs_clarification else "ready",
        )
|
literature/config.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Configuration management for Literature Discovery module.
|
| 3 |
+
Uses pydantic-settings for environment variable loading.
|
| 4 |
+
"""
|
| 5 |
+
from typing import Optional, List
|
| 6 |
+
from pydantic import Field
|
| 7 |
+
from pydantic_settings import BaseSettings
|
| 8 |
+
from functools import lru_cache
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class LiteratureConfig(BaseSettings):
    """Literature mining configuration.

    Settings are loaded by pydantic-settings from environment variables
    (via the aliases below) and an optional `.env` file; the field defaults
    apply when a variable is unset. Unknown env vars are ignored.
    """

    # API Keys — all optional except the contact email used for PubMed/Entrez.
    pubmed_email: str = Field(default="scholar@university.edu", alias="PUBMED_EMAIL")
    pubmed_api_key: Optional[str] = Field(default=None, alias="PUBMED_API_KEY")
    semantic_scholar_api_key: Optional[str] = Field(default=None, alias="SEMANTIC_SCHOLAR_API_KEY")
    gemini_api_key: Optional[str] = Field(default=None, alias="GEMINI_API_KEY")
    openai_api_key: Optional[str] = Field(default=None, alias="MY_OPEN_WEBUI_API_KEY")
    openai_base_url: Optional[str] = Field(default=None, alias="OPENAI_BASE_URL")
    pageindex_api_key: Optional[str] = Field(default=None, alias="PAGEINDEX_API_KEY")

    # LLM Configuration (model ids are litellm-style "provider/model" strings)
    llm_model: str = Field(default="gemini/gemini-2.0-flash", alias="LLM_MODEL")
    embedding_model: str = Field(default="gemini/text-embedding-004")
    llm_temperature: float = Field(default=0.1, ge=0.0, le=1.0)
    llm_max_tokens: int = Field(default=4096)

    # Search Configuration — per-source toggles and default result cap.
    default_search_limit: int = Field(default=20)
    pubmed_enabled: bool = Field(default=True)
    arxiv_enabled: bool = Field(default=True)
    semantic_scholar_enabled: bool = Field(default=True)  # Now enabled

    # Rate Limiting (Semantic Scholar allows ~1 req/sec)
    semantic_scholar_delay_s: float = Field(default=1.5)  # Slightly over 1s for safety
    pubmed_delay_s: float = Field(default=0.5)

    # Storage — relative paths; resolved against the process working directory.
    pdf_storage_dir: str = Field(default="data/literature/raw_pdfs")
    database_path: str = Field(default="data/literature/papers.db")

    # Processing
    max_concurrent_downloads: int = Field(default=3)
    extraction_timeout_s: int = Field(default=120)

    # PDF Download Headers — browser-like User-Agent to avoid publisher 403s.
    user_agent: str = Field(
        default="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    )

    # Target Polymers (for focused search)
    target_polymers: List[str] = Field(
        default=["PEDOT:PSS", "P3HT", "PBTTT", "P(NDI2OD-T2)", "PDPP-4T"]
    )

    # Extraction strategy: "paperqa" or "simple"
    extraction_strategy: str = Field(default="simple")
    # JSON-encoded list of selectable models; parsed by the consumer,
    # not here — presumably the UI layer. TODO confirm expected schema.
    literature_model_options: str = Field(default="[]", alias="LITERATURE_MODEL_OPTIONS")

    # pydantic-settings options: read `.env`, ignore unrelated variables.
    model_config = {
        "env_file": ".env",
        "env_file_encoding": "utf-8",
        "extra": "ignore",
    }
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
@lru_cache()
def get_config() -> LiteratureConfig:
    """Get configuration singleton.

    `lru_cache` makes this a process-wide singleton: the environment and
    `.env` file are read only on the first call, and every later call
    returns the same `LiteratureConfig` instance.
    """
    return LiteratureConfig()
|
literature/converters.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Data model converters.
|
| 3 |
+
|
| 4 |
+
This module is now schema-optional:
|
| 5 |
+
- If legacy `src.utils.schema` exists, returns (Experiment, Result) objects.
|
| 6 |
+
- Otherwise returns two plain dict payloads for compatibility.
|
| 7 |
+
"""
|
| 8 |
+
import time
|
| 9 |
+
from typing import Any, Dict, Tuple
|
| 10 |
+
|
| 11 |
+
from .schemas import PolymerDataPoint
|
| 12 |
+
|
| 13 |
+
# Probe for the legacy schema module. When present, to_experiment_result
# returns real (Experiment, Result) objects; otherwise it falls back to
# plain dict payloads with the same shape.
try:
    from src.utils.schema import Experiment, Result  # type: ignore
    HAS_LEGACY_SCHEMA = True
except Exception:
    Experiment = None  # type: ignore
    Result = None  # type: ignore
    HAS_LEGACY_SCHEMA = False
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def to_experiment_result(dp: PolymerDataPoint) -> Tuple[Any, Any]:
    """Convert a literature data point into an (experiment, result) pair.

    Returns legacy ``(Experiment, Result)`` objects when ``src.utils.schema``
    is importable (``HAS_LEGACY_SCHEMA``), otherwise two plain dicts with
    the same field layout. Missing numeric inputs are coerced to zero.
    """
    # Millisecond timestamp keeps ids unique across rapid successive calls.
    exp_id = f"lit_{dp.source_paper_id}_{int(time.time() * 1000)}"

    # Provenance and secondary measurements travel in the metadata bag.
    metadata: Dict[str, Any] = {
        "dopant": dp.dopant,
        "dopant_ratio": dp.dopant_ratio,
        "solvent": dp.solvent,
        "source_paper_id": dp.source_paper_id,
        "source_table": dp.source_table_or_figure,
        "quality_tier": dp.quality_tier.value,
        "extraction_confidence": dp.extraction_confidence,
        "film_thickness_nm": dp.film_thickness_nm,
        "seebeck_coefficient_uv_k": dp.seebeck_coefficient_uv_k,
        "power_factor_uw_m_k2": dp.power_factor_uw_m_k2,
    }

    exp_payload: Dict[str, Any] = {
        "id": exp_id,
        "polymer_id": dp.polymer_name,
        "concentration_mg_ml": dp.concentration_mg_ml or 0.0,
        "spin_speed_rpm": dp.spin_speed_rpm or 0,
        "annealing_temp_c": dp.annealing_temp_c or 0.0,
        "annealing_time_min": dp.annealing_time_min or 0.0,
        "status": "completed",
        "metadata": metadata,
    }

    res_payload: Dict[str, Any] = {
        "experiment_id": exp_id,
        "ec_s_cm": dp.electrical_conductivity_s_cm or 0.0,
        "tc_w_mk": dp.thermal_conductivity_w_mk,
        "xrd_crystallinity": dp.xrd_crystallinity_percent,
        "xrd_pi_stacking_angstrom": dp.xrd_pi_stacking_angstrom,
        "source": "literature",
    }

    if not HAS_LEGACY_SCHEMA:
        return exp_payload, res_payload
    return Experiment(**exp_payload), Result(**res_payload)  # type: ignore
|
literature/discovery.py
ADDED
|
@@ -0,0 +1,380 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Multi-source paper discovery module.
|
| 3 |
+
Implements PubMed, ArXiv, and Semantic Scholar search.
|
| 4 |
+
Uses synchronous code for MVP simplicity.
|
| 5 |
+
"""
|
| 6 |
+
import logging
|
| 7 |
+
import time
|
| 8 |
+
from typing import List, Optional
|
| 9 |
+
|
| 10 |
+
import arxiv
|
| 11 |
+
from Bio import Entrez
|
| 12 |
+
|
| 13 |
+
from .schemas import PaperMetadata, PaperSource
|
| 14 |
+
from .config import get_config
|
| 15 |
+
|
| 16 |
+
logger = logging.getLogger(__name__)
# One-shot guard so a missing `semanticscholar` package is logged only once
# per process (see SemanticScholarSearcher.search).
_SEMANTIC_SCHOLAR_IMPORT_MISSING_LOGGED = False
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class ArxivSearcher:
    """ArXiv paper searcher built on the `arxiv` client package."""

    def __init__(self) -> None:
        self.client = arxiv.Client()

    def search(self, query: str, limit: int = 10) -> List[PaperMetadata]:
        """
        Search ArXiv for papers, sorted by relevance.

        Args:
            query: Search query string
            limit: Maximum number of results

        Returns:
            List of PaperMetadata objects; partial results already fetched
            are kept even if the feed fails midway.
        """
        logger.info(f"Searching ArXiv: '{query}' (limit={limit})")

        search = arxiv.Search(
            query=query,
            max_results=limit,
            sort_by=arxiv.SortCriterion.Relevance
        )

        papers: List[PaperMetadata] = []
        try:
            for result in self.client.results(search):
                # Extract arxiv ID without version, e.g. "2301.12345v2" -> "2301.12345".
                # NOTE(review): for old-style ids like "cond-mat/0601001" the
                # split('/')[-1] drops the category prefix — confirm acceptable.
                arxiv_id = result.entry_id.split('/')[-1].split('v')[0]

                paper = PaperMetadata(
                    id=f"arxiv_{arxiv_id}",
                    title=result.title,
                    authors=[a.name for a in result.authors],
                    year=result.published.year if result.published else None,
                    doi=result.doi,
                    abstract=result.summary,
                    venue="arXiv",
                    citation_count=None,  # arXiv feed carries no citation data
                    is_open_access=True,  # all arXiv papers are open access
                    source=PaperSource.ARXIV,
                    url=result.entry_id,
                    landing_url=result.entry_id,
                    pdf_url=result.pdf_url,
                )
                papers.append(paper)
        except Exception as e:
            # Best-effort: log and return whatever was collected before the failure.
            logger.error(f"ArXiv search failed: {e}")

        logger.info(f"ArXiv returned {len(papers)} papers")
        return papers
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
class PubMedSearcher:
    """PubMed paper searcher using Biopython Entrez (esearch + efetch)."""

    def __init__(self) -> None:
        # Entrez requires a contact email; an API key raises the rate limit.
        config = get_config()
        Entrez.email = config.pubmed_email
        if config.pubmed_api_key:
            Entrez.api_key = config.pubmed_api_key
        # Pause between the esearch and efetch calls (seconds).
        self.delay = config.pubmed_delay_s

    def search(self, query: str, limit: int = 10) -> List[PaperMetadata]:
        """
        Search PubMed for papers.

        Two-step Entrez flow: esearch for PMIDs, then efetch for full
        article records. Any failure returns an empty list.

        Args:
            query: Search query string
            limit: Maximum number of results

        Returns:
            List of PaperMetadata objects
        """
        logger.info(f"Searching PubMed: '{query}' (limit={limit})")

        try:
            # Step 1: Search for IDs
            handle = Entrez.esearch(db="pubmed", term=query, retmax=limit)
            record = Entrez.read(handle)
            handle.close()

            id_list = record.get("IdList", [])
            if not id_list:
                logger.info("PubMed returned 0 papers")
                return []

            time.sleep(self.delay)

            # Step 2: Fetch details in XML format
            handle = Entrez.efetch(
                db="pubmed",
                id=id_list,
                rettype="xml",
                retmode="xml"
            )
            records = Entrez.read(handle)
            handle.close()

            papers: List[PaperMetadata] = []
            for article in records.get("PubmedArticle", []):
                try:
                    paper = self._parse_pubmed_article(article)
                    if paper:
                        papers.append(paper)
                except Exception as e:
                    # One malformed record should not abort the whole batch.
                    logger.warning(f"Failed to parse PubMed article: {e}")

            logger.info(f"PubMed returned {len(papers)} papers")
            return papers

        except Exception as e:
            logger.error(f"PubMed search failed: {e}")
            return []

    def _parse_pubmed_article(self, article: dict) -> Optional[PaperMetadata]:
        """Parse a single PubMed article record into PaperMetadata.

        Returns None when the record carries no PMID. Every field access is
        defensive because Entrez XML records vary in structure.
        """
        medline = article.get("MedlineCitation", {})
        article_data = medline.get("Article", {})

        # Extract PMID — required; it anchors the id and the landing URLs.
        pmid = str(medline.get("PMID", ""))
        if not pmid:
            return None

        # Extract title (may arrive as a list of fragments)
        title = article_data.get("ArticleTitle", "Unknown Title")
        if isinstance(title, list):
            title = " ".join(str(t) for t in title)

        # Extract authors — only entries with a last name are kept.
        authors: List[str] = []
        author_list = article_data.get("AuthorList", [])
        for author in author_list:
            if isinstance(author, dict):
                last_name = author.get("LastName", "")
                fore_name = author.get("ForeName", "")
                if last_name:
                    authors.append(f"{fore_name} {last_name}".strip())

        # Extract year from the journal issue's PubDate, when numeric.
        # NOTE(review): records using "MedlineDate" instead of "Year" yield
        # year=None — confirm that is acceptable.
        year = None
        pub_date = article_data.get("Journal", {}).get("JournalIssue", {}).get("PubDate", {})
        if "Year" in pub_date:
            try:
                year = int(pub_date["Year"])
            except (ValueError, TypeError):
                pass

        # Extract abstract — structured abstracts are flattened by joining.
        abstract = ""
        abstract_text = article_data.get("Abstract", {}).get("AbstractText", [])
        if isinstance(abstract_text, list):
            abstract = " ".join(str(t) for t in abstract_text)
        elif isinstance(abstract_text, str):
            abstract = abstract_text

        # Extract DOI from ELocationID entries (Entrez string elements carry
        # their XML attributes on `.attributes`).
        doi = None
        id_list = article_data.get("ELocationID", [])
        for eid in id_list:
            if hasattr(eid, "attributes") and eid.attributes.get("EIdType") == "doi":
                doi = str(eid)
                break

        journal = article_data.get("Journal", {})
        journal_title = journal.get("Title")

        return PaperMetadata(
            id=f"pubmed_{pmid}",
            title=str(title),
            authors=authors,
            year=year,
            doi=doi,
            abstract=abstract,
            venue=str(journal_title) if journal_title else None,
            citation_count=None,  # not provided by Entrez
            is_open_access=None,  # unknown from this endpoint
            source=PaperSource.PUBMED,
            url=f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/",
            landing_url=f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/",
        )
|
| 203 |
+
|
| 204 |
+
|
| 205 |
+
class SemanticScholarSearcher:
    """Semantic Scholar paper searcher (with rate limiting)."""

    def __init__(self) -> None:
        config = get_config()
        self.api_key = config.semantic_scholar_api_key
        # Delay applied before the request and between result items (seconds).
        self.delay = config.semantic_scholar_delay_s

    def search(self, query: str, limit: int = 10) -> List[PaperMetadata]:
        """
        Search Semantic Scholar for papers.
        Rate limited to avoid 403 errors; returns [] when the optional
        `semanticscholar` package is not installed or the request fails.

        Args:
            query: Search query string
            limit: Maximum number of results

        Returns:
            List of PaperMetadata objects
        """
        logger.info(f"Searching Semantic Scholar: '{query}' (limit={limit})")

        # Lazy import to avoid a hard dependency on the optional package.
        try:
            from semanticscholar import SemanticScholar
        except ImportError:
            # Log the missing dependency only once per process.
            global _SEMANTIC_SCHOLAR_IMPORT_MISSING_LOGGED
            if not _SEMANTIC_SCHOLAR_IMPORT_MISSING_LOGGED:
                logger.debug("semanticscholar package not installed; Semantic Scholar source disabled.")
                _SEMANTIC_SCHOLAR_IMPORT_MISSING_LOGGED = True
            return []

        time.sleep(self.delay)  # Initial delay

        try:
            client = SemanticScholar(api_key=self.api_key)
            results = client.search_paper(
                query,
                limit=limit,
                fields=['title', 'abstract', 'authors', 'year', 'externalIds', 'url', 'isOpenAccess', 'openAccessPdf', 'venue', 'citationCount']
            )

            papers: List[PaperMetadata] = []
            for item in results:
                # The client may yield more than `limit` items; cap manually.
                if len(papers) >= limit:
                    break

                # Get PDF URL if available
                pdf_url = None
                if item.openAccessPdf and isinstance(item.openAccessPdf, dict):
                    pdf_url = item.openAccessPdf.get('url')

                paper = PaperMetadata(
                    id=f"s2_{item.paperId}",
                    title=item.title or "Unknown",
                    authors=[a.name for a in (item.authors or [])],
                    year=item.year,
                    doi=item.externalIds.get("DOI") if item.externalIds else None,
                    abstract=item.abstract,
                    venue=getattr(item, "venue", None),
                    citation_count=getattr(item, "citationCount", None),
                    is_open_access=bool(getattr(item, "isOpenAccess", False)),
                    source=PaperSource.SEMANTIC_SCHOLAR,
                    url=item.url,
                    landing_url=item.url,
                    pdf_url=pdf_url,
                )
                papers.append(paper)
                time.sleep(self.delay)  # Rate limit between items

            logger.info(f"Semantic Scholar returned {len(papers)} papers")
            return papers

        except Exception as e:
            logger.warning(f"Semantic Scholar search failed (likely 403): {e}")
            return []
|
| 281 |
+
|
| 282 |
+
|
| 283 |
+
class PaperDiscoveryAgent:
|
| 284 |
+
"""
|
| 285 |
+
Paper discovery agent.
|
| 286 |
+
Aggregates multiple search sources, deduplicates, and sorts results.
|
| 287 |
+
"""
|
| 288 |
+
|
| 289 |
+
def __init__(self) -> None:
|
| 290 |
+
config = get_config()
|
| 291 |
+
self.searchers: List[tuple] = []
|
| 292 |
+
|
| 293 |
+
if config.arxiv_enabled:
|
| 294 |
+
self.searchers.append(("arxiv", ArxivSearcher()))
|
| 295 |
+
if config.pubmed_enabled:
|
| 296 |
+
self.searchers.append(("pubmed", PubMedSearcher()))
|
| 297 |
+
if config.semantic_scholar_enabled:
|
| 298 |
+
self.searchers.append(("semantic_scholar", SemanticScholarSearcher()))
|
| 299 |
+
|
| 300 |
+
logger.info(f"Initialized PaperDiscoveryAgent with sources: {[s[0] for s in self.searchers]}")
|
| 301 |
+
|
| 302 |
+
def discover(
|
| 303 |
+
self,
|
| 304 |
+
query: str,
|
| 305 |
+
limit_per_source: int = 10,
|
| 306 |
+
deduplicate: bool = True
|
| 307 |
+
) -> List[PaperMetadata]:
|
| 308 |
+
"""
|
| 309 |
+
Search all sources and aggregate results.
|
| 310 |
+
|
| 311 |
+
Args:
|
| 312 |
+
query: Search query
|
| 313 |
+
limit_per_source: Maximum results per source
|
| 314 |
+
deduplicate: Whether to deduplicate by title
|
| 315 |
+
|
| 316 |
+
Returns:
|
| 317 |
+
Aggregated list of papers
|
| 318 |
+
"""
|
| 319 |
+
all_papers: List[PaperMetadata] = []
|
| 320 |
+
|
| 321 |
+
for source_name, searcher in self.searchers:
|
| 322 |
+
try:
|
| 323 |
+
papers = searcher.search(query, limit_per_source)
|
| 324 |
+
all_papers.extend(papers)
|
| 325 |
+
logger.info(f"{source_name} returned {len(papers)} papers")
|
| 326 |
+
except Exception as e:
|
| 327 |
+
logger.error(f"Search failed for {source_name}: {e}")
|
| 328 |
+
|
| 329 |
+
logger.info(f"Total papers before deduplication: {len(all_papers)}")
|
| 330 |
+
|
| 331 |
+
if deduplicate:
|
| 332 |
+
all_papers = self._deduplicate(all_papers)
|
| 333 |
+
logger.info(f"Total papers after deduplication: {len(all_papers)}")
|
| 334 |
+
|
| 335 |
+
return all_papers
|
| 336 |
+
|
| 337 |
+
def _deduplicate(self, papers: List[PaperMetadata]) -> List[PaperMetadata]:
|
| 338 |
+
"""Deduplicate papers by normalized title."""
|
| 339 |
+
seen_titles: set = set()
|
| 340 |
+
unique_papers: List[PaperMetadata] = []
|
| 341 |
+
|
| 342 |
+
for paper in papers:
|
| 343 |
+
# Normalize title for comparison
|
| 344 |
+
normalized = paper.title.lower().strip()
|
| 345 |
+
if normalized not in seen_titles:
|
| 346 |
+
seen_titles.add(normalized)
|
| 347 |
+
unique_papers.append(paper)
|
| 348 |
+
|
| 349 |
+
return unique_papers
|
| 350 |
+
|
| 351 |
+
def build_thermoelectric_query(
|
| 352 |
+
self,
|
| 353 |
+
polymer: Optional[str] = None,
|
| 354 |
+
include_tc: bool = True
|
| 355 |
+
) -> str:
|
| 356 |
+
"""
|
| 357 |
+
Build a specialized thermoelectric search query.
|
| 358 |
+
|
| 359 |
+
Args:
|
| 360 |
+
polymer: Specific polymer name (e.g., "P3HT")
|
| 361 |
+
include_tc: Whether to include thermal conductivity keywords
|
| 362 |
+
|
| 363 |
+
Returns:
|
| 364 |
+
Optimized search query string
|
| 365 |
+
"""
|
| 366 |
+
base_terms = [
|
| 367 |
+
"organic thermoelectric",
|
| 368 |
+
"conjugated polymer",
|
| 369 |
+
"electrical conductivity",
|
| 370 |
+
]
|
| 371 |
+
|
| 372 |
+
if include_tc:
|
| 373 |
+
base_terms.append("thermal conductivity")
|
| 374 |
+
|
| 375 |
+
if polymer:
|
| 376 |
+
base_terms.insert(0, polymer)
|
| 377 |
+
|
| 378 |
+
query = " ".join(base_terms)
|
| 379 |
+
logger.debug(f"Built query: {query}")
|
| 380 |
+
return query
|
literature/evaluation.py
ADDED
|
@@ -0,0 +1,155 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Offline evaluation helpers for structured literature extraction.
|
| 3 |
+
|
| 4 |
+
The harness is intentionally dataset-agnostic so POLYIE-formatted exports and
|
| 5 |
+
internal regression sets can share the same metric implementation.
|
| 6 |
+
"""
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
import json
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
from typing import Any, Dict, Iterable, List, Sequence, Tuple
|
| 12 |
+
|
| 13 |
+
from .property_registry import normalize_property_key
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
# Field names compared when scoring predicted records against gold records;
# also the fields counted for record_completeness in evaluate_predictions().
CORE_FIELDS = ["material_name", "property_key", "raw_value", "raw_unit", "method"]
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def load_json_records(path: str | Path) -> List[Dict[str, Any]]:
|
| 20 |
+
fp = Path(path)
|
| 21 |
+
if fp.suffix == ".jsonl":
|
| 22 |
+
return [json.loads(line) for line in fp.read_text(encoding="utf-8").splitlines() if line.strip()]
|
| 23 |
+
data = json.loads(fp.read_text(encoding="utf-8"))
|
| 24 |
+
if isinstance(data, list):
|
| 25 |
+
return data
|
| 26 |
+
raise ValueError(f"Unsupported evaluation file format: {fp}")
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def normalize_record(record: Dict[str, Any]) -> Dict[str, Any]:
    """Coerce a heterogeneous record (POLYIE or internal naming) into the
    canonical comparison schema used by the evaluation metrics."""

    def _coalesce(*keys: str) -> str:
        # First truthy value among the candidate keys, stringified; else "".
        for key in keys:
            candidate = record.get(key)
            if candidate:
                return str(candidate)
        return ""

    raw_property = _coalesce("property_key", "property_name")
    # Prefer the registry's canonical key; fall back to the stripped raw name.
    canonical_property = normalize_property_key(raw_property) or raw_property.strip()

    return {
        "material_name": _coalesce("material_name", "polymer_name", "material").strip(),
        "property_key": canonical_property,
        "raw_value": _coalesce("raw_value", "value").strip(),
        "raw_unit": _coalesce("raw_unit", "unit").strip(),
        "method": _coalesce("method", "measurement_method").strip(),
        "evidence_quote": _coalesce("evidence_quote", "source_quote").strip(),
    }
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def _safe_div(numerator: float, denominator: float) -> float:
|
| 54 |
+
return numerator / denominator if denominator else 0.0
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def _f1(precision: float, recall: float) -> float:
|
| 58 |
+
return (2 * precision * recall) / (precision + recall) if (precision + recall) else 0.0
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
def _field_pairs(records: Sequence[Dict[str, Any]], field: str) -> set[Tuple[str, str]]:
    """Collect lowercase (material, field-value) pairs; empty values skipped."""
    collected: set[Tuple[str, str]] = set()
    for raw in records:
        rec = normalize_record(raw)
        material = rec.get("material_name", "")
        payload = rec.get(field, "")
        # Only score pairs where both halves are present.
        if material and payload:
            collected.add((material.lower(), payload.lower()))
    return collected
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def _relation_tuples(records: Sequence[Dict[str, Any]]) -> set[Tuple[str, str, str]]:
    """Lowercased (material, property, value) triples for relation-level scoring."""
    out: set[Tuple[str, str, str]] = set()
    for raw in records:
        rec = normalize_record(raw)
        material = rec["material_name"]
        prop = rec["property_key"]
        value = rec["raw_value"]
        # A relation only counts when all three components are non-empty.
        if material and prop and value:
            out.add((material.lower(), prop.lower(), value.lower()))
    return out
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
def _record_tuples(records: Sequence[Dict[str, Any]]) -> set[Tuple[str, str, str, str, str]]:
    """Lowercased tuples over all CORE_FIELDS (one per record; empties kept)."""
    return {
        tuple(normalize_record(raw)[field].lower() for field in CORE_FIELDS)
        for raw in records
    }
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
def evaluate_predictions(
    gold_records: Sequence[Dict[str, Any]],
    predicted_records: Sequence[Dict[str, Any]],
) -> Dict[str, Any]:
    """Score predicted extraction records against gold records.

    Computes per-field, relation-level (material/property/value triples) and
    record-level (all CORE_FIELDS) precision/recall/F1, plus how complete and
    source-grounded the predictions are.
    """
    gold = [normalize_record(record) for record in gold_records]
    predicted = [normalize_record(record) for record in predicted_records]

    def _prf(gold_set: set, pred_set: set) -> Dict[str, float]:
        # Precision/recall/F1 over two sets of comparison keys.
        hits = len(gold_set & pred_set)
        precision = _safe_div(hits, len(pred_set))
        recall = _safe_div(hits, len(gold_set))
        return {
            "precision": precision,
            "recall": recall,
            "f1": _f1(precision, recall),
        }

    field_metrics: Dict[str, Dict[str, float]] = {
        field: _prf(_field_pairs(gold, field), _field_pairs(predicted, field))
        for field in CORE_FIELDS
    }

    relation_level = _prf(_relation_tuples(gold), _relation_tuples(predicted))
    record_level = _prf(_record_tuples(gold), _record_tuples(predicted))

    # Fraction of all predicted (record, core-field) slots that are filled.
    filled = sum(
        1
        for record in predicted
        for field in CORE_FIELDS
        if record.get(field)
    )
    record_completeness = _safe_div(filled, len(predicted) * len(CORE_FIELDS))

    # Fraction of predicted records carrying an evidence quote.
    grounded = sum(1 for record in predicted if record.get("evidence_quote"))
    source_grounding_hit_rate = _safe_div(grounded, len(predicted))

    return {
        "field_metrics": field_metrics,
        "relation_level": relation_level,
        "record_level": record_level,
        "record_completeness": record_completeness,
        "source_grounding_hit_rate": source_grounding_hit_rate,
        "gold_count": len(gold),
        "predicted_count": len(predicted),
    }
|
literature/extraction.py
ADDED
|
@@ -0,0 +1,863 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
LLM-based structured data extraction module.
|
| 3 |
+
Implements flexible interface: PageIndex (RAG via indexed PDFs) or Simple extraction (fallback).
|
| 4 |
+
|
| 5 |
+
Prompts are dynamically built from user-selected target properties via
|
| 6 |
+
``literature.property_registry.build_extraction_prompt``.
|
| 7 |
+
"""
|
| 8 |
+
import json
|
| 9 |
+
import re
|
| 10 |
+
import logging
|
| 11 |
+
import os
|
| 12 |
+
from typing import List, Optional, Any
|
| 13 |
+
from datetime import datetime
|
| 14 |
+
|
| 15 |
+
from .schemas import (
|
| 16 |
+
PaperMetadata,
|
| 17 |
+
PolymerDataPoint,
|
| 18 |
+
ExtractionResult,
|
| 19 |
+
DataQuality
|
| 20 |
+
)
|
| 21 |
+
from .config import get_config
|
| 22 |
+
from .retrieval import extract_text_from_pdf
|
| 23 |
+
from .property_registry import PROPERTY_CATALOG, build_extraction_prompt, TEMPLATES
|
| 24 |
+
|
| 25 |
+
logger = logging.getLogger(__name__)
|
| 26 |
+
|
| 27 |
+
# Default property set used when no explicit target properties are provided.
# The legacy thermoelectric-only template no longer exists in the production
# registry, so fall back to the platform-wide property core.
# NOTE(review): TEMPLATES.get("platform_core") must be a truthy list — a
# present-but-empty template silently falls through to the full catalog.
_DEFAULT_PROPERTIES = TEMPLATES.get("platform_core") or list(PROPERTY_CATALOG.keys())
|
| 31 |
+
|
| 32 |
+
# Sentinel error strings that mark an extraction as intentionally skipped
# (missing configuration) rather than a genuine failure. Consumers can
# distinguish the two via is_expected_skip_error().
_SKIP_ERROR_MESSAGES = {
    "llm_unconfigured",
    "contextual_llm_unconfigured",
    "extraction_backend_unconfigured",
    "pageindex_requires_pdf_no_simple_backend",
    "pageindex_sdk_unavailable_no_simple_backend",
}
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def _normalize_base_url(url: Optional[str]) -> Optional[str]:
|
| 42 |
+
text = str(url or "").strip().rstrip("/")
|
| 43 |
+
return text or None
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def _is_http_url(url: Optional[str]) -> bool:
    """True when the normalized URL carries an explicit http(s) scheme."""
    normalized = _normalize_base_url(url)
    if not normalized:
        return False
    # startswith accepts a tuple of accepted prefixes.
    return normalized.startswith(("http://", "https://"))
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def is_expected_skip_error(error_message: Optional[str]) -> bool:
    """True when the error string is a deliberate configuration-skip sentinel."""
    candidate = str(error_message or "").strip()
    return candidate in _SKIP_ERROR_MESSAGES
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
# ============== JSON Safe Parsing (Fix Logic Bug #4 & #5) ==============
|
| 56 |
+
|
| 57 |
+
def normalize_minus_signs(s: str) -> str:
    """
    Normalize all types of minus signs to ASCII minus.

    Fixes Logic Bug #5: OCR may produce Unicode minus (U+2212) instead of ASCII.

    Args:
        s: Text that may contain Unicode dash/minus look-alikes.

    Returns:
        The same text with every look-alike replaced by ASCII '-'.
    """
    # Single C-level pass via str.translate instead of eight chained replaces.
    table = str.maketrans({
        '−': '-',  # U+2212 MINUS SIGN
        '–': '-',  # U+2013 EN DASH
        '—': '-',  # U+2014 EM DASH
        '‐': '-',  # U+2010 HYPHEN
        '‑': '-',  # U+2011 NON-BREAKING HYPHEN
        '‒': '-',  # U+2012 FIGURE DASH
        '⁻': '-',  # U+207B SUPERSCRIPT MINUS
        '₋': '-',  # U+208B SUBSCRIPT MINUS
    })
    return s.translate(table)
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def safe_json_loads(text: str) -> Any:
    """
    Safely parse JSON, handling common LLM output issues.

    Fixes Logic Bug #4: LLM may return NaN, Infinity, Python-style None, trailing commas.

    Args:
        text: Raw LLM output, possibly wrapped in markdown code fences.

    Returns:
        The parsed Python object, or None for empty input.

    Raises:
        json.JSONDecodeError: If the text cannot be parsed even after repair.
    """
    if not text:
        return None

    text = text.strip()

    # Extract JSON from markdown code blocks
    if "```json" in text:
        text = text.split("```json")[1].split("```")[0]
    elif "```" in text:
        parts = text.split("```")
        if len(parts) >= 2:
            text = parts[1]

    # Normalize minus signs
    text = normalize_minus_signs(text)

    # Fix Python-style -> JSON-style
    # NOTE: word-boundary substitution can also rewrite these tokens inside
    # string values; accepted trade-off for best-effort LLM output repair.
    text = re.sub(r'\bNone\b', 'null', text)
    text = re.sub(r'\bTrue\b', 'true', text)
    text = re.sub(r'\bFalse\b', 'false', text)

    # Remove trailing commas
    text = re.sub(r',\s*}', '}', text)
    text = re.sub(r',\s*]', ']', text)

    # Handle NaN and Infinity. The optional leading '-' must be consumed in
    # the SAME substitution: replacing bare "Infinity" first would turn
    # "-Infinity" into the invalid token "-null" and break parsing.
    text = re.sub(r'\bNaN\b', 'null', text)
    text = re.sub(r'-?\bInfinity\b', 'null', text)

    try:
        return json.loads(text)
    except json.JSONDecodeError as e:
        logger.warning(f"Initial JSON parse failed: {e}")

        # Try json_repair if available
        try:
            from json_repair import repair_json
            repaired = repair_json(text)
            return json.loads(repaired)
        except ImportError:
            logger.warning("json_repair not installed, cannot repair JSON")
            raise
        except Exception as e2:
            logger.error(f"JSON repair also failed: {e2}")
            raise
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
# Extraction prompt template
# NOTE(review): this static thermoelectric prompt appears unused by the
# visible extraction paths, which build prompts dynamically via
# build_extraction_prompt(); presumably retained for external callers — verify.
EXTRACTION_PROMPT = """
You are an expert in organic thermoelectrics and polymer science.
Your task is to extract ALL experimental data points from the provided paper.

## Target Data
Extract data for conjugated polymers used in thermoelectric applications, including:
- PEDOT:PSS, P3HT, PBTTT, P(NDI2OD-T2), PDPP series, etc.

## Required Fields (extract as many as available)
For EACH data point, extract:

### Material Information
- polymer_name: The polymer name/abbreviation (e.g., "P3HT", "PEDOT:PSS")
- dopant: Dopant used (e.g., "DMSO", "H2SO4", "FeCl3")
- dopant_ratio: Dopant concentration if specified (e.g., "5 wt%", "1 M")

### Processing Conditions
- solvent: Solvent used for film preparation
- concentration_mg_ml: Solution concentration in mg/mL
- spin_speed_rpm: Spin coating speed in RPM
- spin_time_s: Spin coating time in seconds
- annealing_temp_c: Annealing temperature in Celsius
- annealing_time_min: Annealing time in minutes
- annealing_atmosphere: Atmosphere during annealing (N2, Air, Vacuum)
- film_thickness_nm: Film thickness in nanometers

### Electrical Properties
- electrical_conductivity_s_cm: Electrical conductivity in S/cm
- seebeck_coefficient_uv_k: Seebeck coefficient in μV/K
- power_factor_uw_m_k2: Power factor in μW/(m·K²)

### Thermal Properties (IMPORTANT - often sparse)
- thermal_conductivity_w_mk: Thermal conductivity in W/(m·K)
- zt_figure_of_merit: ZT figure of merit (dimensionless)

### Structural Characterization
- xrd_crystallinity_percent: Crystallinity percentage from XRD
- xrd_pi_stacking_angstrom: π-π stacking distance in Angstrom
- xrd_lamellar_spacing_angstrom: Lamellar spacing in Angstrom

### Metadata
- source_table_or_figure: Where the data was found (e.g., "Table 1", "Figure 3")
- extraction_confidence: Your confidence in this extraction (0.0 to 1.0)

## CRITICAL Rules
1. Extract ONLY experimentally measured values, not theoretical predictions
2. Convert all units to the specified standard units
3. If a value range is given (e.g., "100-200 S/cm"), use the AVERAGE
4. If a value is "not measured" or "N/A", use null
5. Each row in a table = one data point
6. Include the source_table_or_figure for traceability

## Output Format
Return a valid JSON array. Example:
[
  {
    "polymer_name": "PEDOT:PSS",
    "dopant": "H2SO4",
    "dopant_ratio": "5 vol%",
    "electrical_conductivity_s_cm": 1200.5,
    "thermal_conductivity_w_mk": 0.35,
    "source_table_or_figure": "Table 2",
    "extraction_confidence": 0.9
  }
]

Return ONLY the JSON array, no markdown formatting, no explanations.
If no relevant data is found, return an empty array: []
"""
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
class DataExtractor:
|
| 207 |
+
"""
|
| 208 |
+
Flexible data extractor with fallback strategy.
|
| 209 |
+
|
| 210 |
+
Primary: PageIndex (RAG via indexed PDFs)
|
| 211 |
+
Fallback: Simple extraction (pymupdf + direct LLM)
|
| 212 |
+
"""
|
| 213 |
+
|
| 214 |
+
def __init__(
|
| 215 |
+
self,
|
| 216 |
+
strategy: Optional[str] = None,
|
| 217 |
+
target_properties: Optional[List[str]] = None,
|
| 218 |
+
extra_instructions: str = "",
|
| 219 |
+
) -> None:
|
| 220 |
+
config = get_config()
|
| 221 |
+
self.strategy = strategy or config.extraction_strategy
|
| 222 |
+
self.llm_model = config.llm_model
|
| 223 |
+
self.gemini_key = config.gemini_api_key
|
| 224 |
+
self.openai_key = config.openai_api_key
|
| 225 |
+
self.openai_base_url = _normalize_base_url(config.openai_base_url)
|
| 226 |
+
self.pdf_dir = config.pdf_storage_dir
|
| 227 |
+
self.pageindex_api_key = config.pageindex_api_key
|
| 228 |
+
self.target_properties = target_properties or _DEFAULT_PROPERTIES
|
| 229 |
+
self.extra_instructions = extra_instructions
|
| 230 |
+
|
| 231 |
+
logger.info(f"Initialized DataExtractor with strategy: {self.strategy}, properties: {self.target_properties}")
|
| 232 |
+
|
| 233 |
+
def has_openai_backend(self) -> bool:
|
| 234 |
+
return _is_http_url(self.openai_base_url)
|
| 235 |
+
|
| 236 |
+
def has_any_llm_backend(self) -> bool:
|
| 237 |
+
return self.has_openai_backend() or bool(str(self.gemini_key or "").strip())
|
| 238 |
+
|
| 239 |
+
def has_pageindex_backend(self) -> bool:
|
| 240 |
+
return bool(str(self.pageindex_api_key or "").strip())
|
| 241 |
+
|
| 242 |
+
def can_attempt_extraction(self) -> bool:
|
| 243 |
+
return self.has_pageindex_backend() or self.has_any_llm_backend()
|
| 244 |
+
|
| 245 |
+
def availability_reason(self) -> Optional[str]:
|
| 246 |
+
if self.can_attempt_extraction():
|
| 247 |
+
return None
|
| 248 |
+
return "Structured extraction skipped: configure PAGEINDEX_API_KEY or a valid LLM backend."
|
| 249 |
+
|
| 250 |
+
def extract_from_papers(
|
| 251 |
+
self,
|
| 252 |
+
papers: List[PaperMetadata],
|
| 253 |
+
use_full_text: bool = True
|
| 254 |
+
) -> List[ExtractionResult]:
|
| 255 |
+
"""
|
| 256 |
+
Extract data from multiple papers.
|
| 257 |
+
|
| 258 |
+
Args:
|
| 259 |
+
papers: List of paper metadata (with pdf_path if available)
|
| 260 |
+
use_full_text: Use PDF full text if available
|
| 261 |
+
|
| 262 |
+
Returns:
|
| 263 |
+
List of extraction results
|
| 264 |
+
"""
|
| 265 |
+
results: List[ExtractionResult] = []
|
| 266 |
+
|
| 267 |
+
for paper in papers:
|
| 268 |
+
try:
|
| 269 |
+
if self.strategy == "pageindex":
|
| 270 |
+
result = self._extract_with_pageindex(paper)
|
| 271 |
+
else:
|
| 272 |
+
result = self._extract_simple(paper, use_full_text)
|
| 273 |
+
results.append(result)
|
| 274 |
+
except Exception as e:
|
| 275 |
+
logger.error(f"Extraction failed for {paper.id}: {e}")
|
| 276 |
+
results.append(ExtractionResult(
|
| 277 |
+
paper=paper,
|
| 278 |
+
success=False,
|
| 279 |
+
error_message=str(e)
|
| 280 |
+
))
|
| 281 |
+
|
| 282 |
+
return results
|
| 283 |
+
|
| 284 |
+
def _extract_simple(
|
| 285 |
+
self,
|
| 286 |
+
paper: PaperMetadata,
|
| 287 |
+
use_full_text: bool = True
|
| 288 |
+
) -> ExtractionResult:
|
| 289 |
+
"""
|
| 290 |
+
Simple extraction: Extract PDF text -> Feed to LLM -> Parse JSON.
|
| 291 |
+
Often more effective for metadata extraction.
|
| 292 |
+
"""
|
| 293 |
+
logger.info(f"Simple extraction for: {paper.title[:50]}...")
|
| 294 |
+
|
| 295 |
+
# Get content
|
| 296 |
+
content = self._prepare_content(paper, use_full_text)
|
| 297 |
+
if not content:
|
| 298 |
+
return ExtractionResult(
|
| 299 |
+
paper=paper,
|
| 300 |
+
success=False,
|
| 301 |
+
error_message="No content available"
|
| 302 |
+
)
|
| 303 |
+
|
| 304 |
+
if not self.has_any_llm_backend():
|
| 305 |
+
return ExtractionResult(
|
| 306 |
+
paper=paper,
|
| 307 |
+
success=False,
|
| 308 |
+
error_message="llm_unconfigured",
|
| 309 |
+
extraction_notes="Simple extraction skipped because no LLM backend is configured.",
|
| 310 |
+
)
|
| 311 |
+
|
| 312 |
+
# Call LLM with dynamic prompt
|
| 313 |
+
dynamic_prompt = build_extraction_prompt(self.target_properties, self.extra_instructions)
|
| 314 |
+
prompt = dynamic_prompt.replace("{title}", paper.title or "Unknown").replace("{content}", content)
|
| 315 |
+
|
| 316 |
+
try:
|
| 317 |
+
raw_response = self._call_llm(prompt)
|
| 318 |
+
|
| 319 |
+
if not raw_response:
|
| 320 |
+
return ExtractionResult(
|
| 321 |
+
paper=paper,
|
| 322 |
+
success=False,
|
| 323 |
+
error_message="LLM returned empty response"
|
| 324 |
+
)
|
| 325 |
+
|
| 326 |
+
# Parse response
|
| 327 |
+
data_points = self._parse_llm_output(raw_response, paper.id)
|
| 328 |
+
|
| 329 |
+
# Assess quality for each point
|
| 330 |
+
for dp in data_points:
|
| 331 |
+
dp.quality_tier = self._assess_quality(dp)
|
| 332 |
+
|
| 333 |
+
return ExtractionResult(
|
| 334 |
+
paper=paper,
|
| 335 |
+
data_points=data_points,
|
| 336 |
+
llm_model_used=self.llm_model,
|
| 337 |
+
extraction_timestamp=datetime.now(),
|
| 338 |
+
success=True
|
| 339 |
+
)
|
| 340 |
+
|
| 341 |
+
except Exception as e:
|
| 342 |
+
logger.error(f"Simple extraction failed: {e}")
|
| 343 |
+
return ExtractionResult(
|
| 344 |
+
paper=paper,
|
| 345 |
+
success=False,
|
| 346 |
+
error_message=str(e)
|
| 347 |
+
)
|
| 348 |
+
|
| 349 |
+
    def _extract_with_pageindex(self, paper: PaperMetadata) -> ExtractionResult:
        """
        PageIndex extraction (RAG-enhanced via indexed PDF).
        Submits PDF to PageIndex, then uses chat_completions with extraction prompt.
        Falls back to simple extraction if PageIndex is unavailable or fails.

        Fallback chain (each precondition failure tries simple extraction first,
        and only returns a sentinel failure when no LLM backend exists either):
        1. No PageIndex key -> simple or "extraction_backend_unconfigured".
        2. No local PDF     -> simple or "pageindex_requires_pdf_no_simple_backend".
        3. SDK missing      -> simple or "pageindex_sdk_unavailable_no_simple_backend".
        4. Any runtime PageIndex error or indexing timeout -> simple extraction.
        """
        if not self.has_pageindex_backend():
            if self.has_any_llm_backend():
                return self._extract_simple(paper)
            return ExtractionResult(
                paper=paper,
                success=False,
                error_message="extraction_backend_unconfigured",
                extraction_notes="No PageIndex or LLM backend is configured.",
            )

        # PageIndex needs the PDF file on disk to submit it for indexing.
        if not paper.pdf_path or not os.path.exists(paper.pdf_path):
            if self.has_any_llm_backend():
                return self._extract_simple(paper)
            return ExtractionResult(
                paper=paper,
                success=False,
                error_message="pageindex_requires_pdf_no_simple_backend",
                extraction_notes="PageIndex extraction requires a PDF when no simple LLM fallback is available.",
            )

        # Import lazily so the module loads even without the optional SDK.
        try:
            from src.literature_service.pageindex_client import PageIndexService
        except ImportError:
            if self.has_any_llm_backend():
                return self._extract_simple(paper)
            return ExtractionResult(
                paper=paper,
                success=False,
                error_message="pageindex_sdk_unavailable_no_simple_backend",
                extraction_notes="PageIndex SDK unavailable and no simple LLM fallback is configured.",
            )

        logger.info(f"PageIndex extraction for: {paper.title[:50]}...")

        try:
            service = PageIndexService(api_key=self.pageindex_api_key)

            # Submit the document to PageIndex
            doc_id = service.submit_document(paper.pdf_path)
            logger.info(f"Submitted to PageIndex, doc_id={doc_id}")

            # Wait for indexing to complete (poll status)
            import time
            for _ in range(30):  # max ~60 seconds
                status = service.get_document_status(doc_id)
                if status == "completed":
                    break
                if status in ("error", "failed"):
                    raise RuntimeError(f"PageIndex indexing failed with status: {status}")
                time.sleep(2)
            else:
                # for/else: loop exhausted without `break` -> indexing timed out.
                logger.warning("PageIndex indexing timed out, falling back to simple")
                return self._extract_simple(paper)

            # Use chat_completions with dynamic extraction prompt
            dynamic_prompt = build_extraction_prompt(self.target_properties, self.extra_instructions)
            # For PageIndex chat, we don't need the {title}/{content} placeholders
            # since the document is already indexed; strip those sections.
            pi_prompt = dynamic_prompt.split("**PAPER CONTENT:**")[0].strip()
            raw_answer = service.chat_completions(pi_prompt, doc_id)

            if not raw_answer:
                return ExtractionResult(
                    paper=paper,
                    success=False,
                    error_message="PageIndex returned empty response"
                )

            # Parse result
            data_points = self._parse_llm_output(raw_answer, paper.id)

            for dp in data_points:
                dp.quality_tier = self._assess_quality(dp)

            return ExtractionResult(
                paper=paper,
                data_points=data_points,
                llm_model_used="pageindex",
                extraction_timestamp=datetime.now(),
                success=True
            )

        except Exception as e:
            # Any PageIndex runtime failure degrades gracefully to simple mode.
            logger.warning(f"PageIndex extraction failed, falling back to simple: {e}")
            return self._extract_simple(paper)
|
| 440 |
+
|
| 441 |
+
def _prepare_content(
|
| 442 |
+
self,
|
| 443 |
+
paper: PaperMetadata,
|
| 444 |
+
use_full_text: bool = True
|
| 445 |
+
) -> Optional[str]:
|
| 446 |
+
"""Prepare text content for extraction."""
|
| 447 |
+
# Try PDF full text first
|
| 448 |
+
if use_full_text and paper.pdf_path and os.path.exists(paper.pdf_path):
|
| 449 |
+
full_text = extract_text_from_pdf(paper.pdf_path, max_pages=5)
|
| 450 |
+
if full_text:
|
| 451 |
+
return f"Title: {paper.title}\n\n{full_text}"
|
| 452 |
+
|
| 453 |
+
# Fallback to abstract
|
| 454 |
+
if paper.abstract:
|
| 455 |
+
return f"Title: {paper.title}\n\nAbstract:\n{paper.abstract}"
|
| 456 |
+
|
| 457 |
+
# Just title
|
| 458 |
+
if paper.title:
|
| 459 |
+
return f"Title: {paper.title}"
|
| 460 |
+
|
| 461 |
+
return None
|
| 462 |
+
|
| 463 |
+
def _call_llm(self, prompt: str) -> Optional[str]:
|
| 464 |
+
"""
|
| 465 |
+
Call LLM (OpenAI-compatible first, then Gemini fallback).
|
| 466 |
+
Prioritizes CRC OpenWebUI for reliability.
|
| 467 |
+
"""
|
| 468 |
+
# Try OpenAI-compatible (CRC) first
|
| 469 |
+
if self.openai_key and self.openai_base_url:
|
| 470 |
+
try:
|
| 471 |
+
logger.info(f"Calling CRC OpenWebUI...")
|
| 472 |
+
return self._call_openai_compatible(prompt)
|
| 473 |
+
except Exception as e:
|
| 474 |
+
logger.warning(f"CRC OpenWebUI call failed: {e}")
|
| 475 |
+
|
| 476 |
+
# Fallback to Gemini
|
| 477 |
+
if self.gemini_key:
|
| 478 |
+
try:
|
| 479 |
+
logger.info("Falling back to Gemini...")
|
| 480 |
+
return self._call_gemini(prompt)
|
| 481 |
+
except Exception as e:
|
| 482 |
+
logger.warning(f"Gemini call failed: {e}")
|
| 483 |
+
|
| 484 |
+
logger.debug("No LLM backend configured; skipping simple extraction call.")
|
| 485 |
+
return None
|
| 486 |
+
|
| 487 |
+
def _call_gemini(self, prompt: str) -> str:
|
| 488 |
+
"""Call Gemini API."""
|
| 489 |
+
import google.generativeai as genai
|
| 490 |
+
|
| 491 |
+
genai.configure(api_key=self.gemini_key)
|
| 492 |
+
model = genai.GenerativeModel("gemini-2.0-flash")
|
| 493 |
+
|
| 494 |
+
response = model.generate_content(prompt)
|
| 495 |
+
return response.text
|
| 496 |
+
|
| 497 |
+
def _call_openai_compatible(self, prompt: str) -> str:
|
| 498 |
+
"""Call OpenAI-compatible API (CRC OpenWebUI)."""
|
| 499 |
+
from openai import OpenAI
|
| 500 |
+
|
| 501 |
+
client = OpenAI(
|
| 502 |
+
api_key=self.openai_key,
|
| 503 |
+
base_url=self.openai_base_url
|
| 504 |
+
)
|
| 505 |
+
|
| 506 |
+
# Use model from config (set in .env LLM_MODEL)
|
| 507 |
+
model = self.llm_model
|
| 508 |
+
# Handle litellm-style prefixes
|
| 509 |
+
if model.startswith("gemini/"):
|
| 510 |
+
model = "gpt-oss:latest" # Fallback for CRC
|
| 511 |
+
logger.info(f"Using model: {model}")
|
| 512 |
+
|
| 513 |
+
response = client.chat.completions.create(
|
| 514 |
+
model=model,
|
| 515 |
+
messages=[{"role": "user", "content": prompt}],
|
| 516 |
+
temperature=0.1
|
| 517 |
+
)
|
| 518 |
+
|
| 519 |
+
return response.choices[0].message.content
|
| 520 |
+
|
| 521 |
+
    def _parse_llm_output(
        self,
        raw_output: str,
        paper_id: str
    ) -> List[PolymerDataPoint]:
        """Parse raw LLM output into structured PolymerDataPoint models.

        Args:
            raw_output: Raw text returned by the LLM, expected to contain
                a JSON array (or a single JSON object) of data points.
            paper_id: Identifier of the source paper, stamped onto every
                resulting data point as ``source_paper_id``.

        Returns:
            List of parsed data points. Returns an empty list when no JSON
            could be recovered; individual malformed items are skipped
            rather than aborting the whole batch.
        """
        try:
            # Use safe_json_loads for robust parsing
            raw_data = safe_json_loads(raw_output)
        except Exception as e:
            logger.error(f"JSON parsing failed for {paper_id}: {e}")
            return []

        if raw_data is None:
            logger.warning(f"No JSON data found in output for {paper_id}")
            return []

        # Ensure it's a list: a bare JSON object is wrapped so the loop
        # below handles both shapes uniformly.
        if not isinstance(raw_data, list):
            raw_data = [raw_data]

        # Convert to Pydantic models. Missing keys fall back to None via
        # dict.get, so partially-filled items still yield a data point.
        data_points: List[PolymerDataPoint] = []
        for item in raw_data:
            try:
                dp = PolymerDataPoint(
                    polymer_name=item.get("polymer_name", "Unknown"),
                    dopant=item.get("dopant"),
                    dopant_ratio=item.get("dopant_ratio"),
                    solvent=item.get("solvent"),
                    concentration_mg_ml=item.get("concentration_mg_ml"),
                    spin_speed_rpm=item.get("spin_speed_rpm"),
                    spin_time_s=item.get("spin_time_s"),
                    annealing_temp_c=item.get("annealing_temp_c"),
                    annealing_time_min=item.get("annealing_time_min"),
                    annealing_atmosphere=item.get("annealing_atmosphere"),
                    film_thickness_nm=item.get("film_thickness_nm"),
                    electrical_conductivity_s_cm=item.get("electrical_conductivity_s_cm"),
                    seebeck_coefficient_uv_k=item.get("seebeck_coefficient_uv_k"),
                    power_factor_uw_m_k2=item.get("power_factor_uw_m_k2"),
                    thermal_conductivity_w_mk=item.get("thermal_conductivity_w_mk"),
                    zt_figure_of_merit=item.get("zt_figure_of_merit"),
                    xrd_crystallinity_percent=item.get("xrd_crystallinity_percent"),
                    xrd_pi_stacking_angstrom=item.get("xrd_pi_stacking_angstrom"),
                    xrd_lamellar_spacing_angstrom=item.get("xrd_lamellar_spacing_angstrom"),
                    source_paper_id=paper_id,
                    source_table_or_figure=item.get("source_table_or_figure"),
                    extraction_confidence=item.get("extraction_confidence", 0.5),
                )
                data_points.append(dp)
            except Exception as e:
                # Skip the malformed item but keep processing the rest.
                logger.warning(f"Failed to parse data point: {e}")

        logger.info(f"Extracted {len(data_points)} data points from {paper_id}")
        return data_points
|
| 576 |
+
|
| 577 |
+
def _assess_quality(self, dp: PolymerDataPoint) -> DataQuality:
|
| 578 |
+
"""Assess data point quality tier."""
|
| 579 |
+
has_ec = dp.electrical_conductivity_s_cm is not None
|
| 580 |
+
has_tc = dp.thermal_conductivity_w_mk is not None
|
| 581 |
+
has_xrd = (dp.xrd_crystallinity_percent is not None or
|
| 582 |
+
dp.xrd_pi_stacking_angstrom is not None)
|
| 583 |
+
has_process = (dp.annealing_temp_c is not None and
|
| 584 |
+
dp.spin_speed_rpm is not None)
|
| 585 |
+
|
| 586 |
+
if has_ec and has_tc and has_xrd and has_process:
|
| 587 |
+
return DataQuality.GOLD
|
| 588 |
+
elif has_ec and (has_xrd or has_process):
|
| 589 |
+
return DataQuality.SILVER
|
| 590 |
+
else:
|
| 591 |
+
return DataQuality.BRONZE
|
| 592 |
+
|
| 593 |
+
|
| 594 |
+
# ============== NEW: Contextualized Extraction ==============
|
| 595 |
+
|
| 596 |
+
CONTEXTUALIZED_EXTRACTION_PROMPT = """
|
| 597 |
+
You are an expert in organic thermoelectrics and polymer science.
|
| 598 |
+
Extract ALL experimental data points from the provided paper.
|
| 599 |
+
|
| 600 |
+
## CRITICAL REQUIREMENTS
|
| 601 |
+
|
| 602 |
+
1. **Extract ALL values, not just the best one**
|
| 603 |
+
- A paper may report multiple values under different conditions
|
| 604 |
+
- Extract EACH value as a separate data point
|
| 605 |
+
|
| 606 |
+
2. **Include COMPLETE experimental conditions**
|
| 607 |
+
- Every value must have its associated conditions
|
| 608 |
+
- Common: temperature, annealing, doping level, measurement method
|
| 609 |
+
|
| 610 |
+
3. **MANDATORY: Include source quote**
|
| 611 |
+
- For EACH data point, include the exact sentence from the paper
|
| 612 |
+
- Quote must be >10 characters and reference the value
|
| 613 |
+
|
| 614 |
+
## TARGET PROPERTIES
|
| 615 |
+
|
| 616 |
+
- `electrical_conductivity` (S/cm, S/m)
|
| 617 |
+
- `thermal_conductivity` (W/mK)
|
| 618 |
+
- `seebeck_coefficient` (μV/K)
|
| 619 |
+
- `power_factor` (μW/mK²)
|
| 620 |
+
- `zt_figure_of_merit` (dimensionless)
|
| 621 |
+
|
| 622 |
+
## OUTPUT FORMAT (JSON Array)
|
| 623 |
+
|
| 624 |
+
Return ONLY valid JSON, no markdown, no explanation:
|
| 625 |
+
|
| 626 |
+
[
|
| 627 |
+
{{
|
| 628 |
+
"polymer_name": "PEDOT:PSS",
|
| 629 |
+
"dopant": "H2SO4",
|
| 630 |
+
"dopant_ratio": "5 vol%",
|
| 631 |
+
"property_name": "electrical_conductivity",
|
| 632 |
+
"raw_value": "4380",
|
| 633 |
+
"raw_unit": "S/cm",
|
| 634 |
+
"conditions": {{
|
| 635 |
+
"solvent": "water",
|
| 636 |
+
"annealing_temp_c": 150,
|
| 637 |
+
"annealing_time_min": 10,
|
| 638 |
+
"measurement_temp_k": 300,
|
| 639 |
+
"measurement_method": "4-point probe"
|
| 640 |
+
}},
|
| 641 |
+
"source_quote": "The electrical conductivity reached 4380 S/cm after H2SO4 treatment.",
|
| 642 |
+
"source_location": "Table 2, Sample S5",
|
| 643 |
+
"extraction_confidence": 0.95
|
| 644 |
+
}}
|
| 645 |
+
]
|
| 646 |
+
|
| 647 |
+
## RULES
|
| 648 |
+
|
| 649 |
+
1. If values range "from X to Y", extract BOTH as separate points
|
| 650 |
+
2. Preserve scientific notation as "5.2e3" or actual number
|
| 651 |
+
3. If no source quote found, set extraction_confidence < 0.5
|
| 652 |
+
4. Return ONLY valid JSON array, no other text
|
| 653 |
+
|
| 654 |
+
---
|
| 655 |
+
|
| 656 |
+
**PAPER CONTENT:**
|
| 657 |
+
|
| 658 |
+
Title: {title}
|
| 659 |
+
|
| 660 |
+
{content}
|
| 661 |
+
|
| 662 |
+
---
|
| 663 |
+
|
| 664 |
+
JSON output:
|
| 665 |
+
"""
|
| 666 |
+
|
| 667 |
+
|
| 668 |
+
class ContextualizedExtractor:
    """
    Contextualized data extractor.

    Produces ContextualizedValue objects with mandatory source quotes for
    traceability. Only talks to an OpenAI-compatible HTTP endpoint (no
    Gemini fallback, unlike DataExtractor).
    """

    def __init__(
        self,
        model_id: Optional[str] = None,
        target_properties: Optional[List[str]] = None,
        extra_instructions: str = "",
    ):
        """
        Initialize extractor.

        Args:
            model_id: LLM model ID to use (default from config)
            target_properties: List of property keys to extract
            extra_instructions: Free-form LLM instructions
        """
        config = get_config()
        self.model_id = model_id or config.llm_model
        self.openai_base_url = _normalize_base_url(config.openai_base_url)
        self.openai_key = config.openai_api_key
        self.target_properties = target_properties or _DEFAULT_PROPERTIES
        self.extra_instructions = extra_instructions

    def is_configured(self) -> bool:
        """Return True when a usable OpenAI-compatible base URL is set."""
        return _is_http_url(self.openai_base_url)

    def extract_from_paper(
        self,
        paper: PaperMetadata,
        use_full_text: bool = True
    ) -> "ExtractionResult":
        """
        Extract contextualized data from a paper.

        Args:
            paper: Paper metadata
            use_full_text: Use PDF full text if available

        Returns:
            ExtractionResult with ContextualizedValue data points
        """
        from .schemas import ContextualizedValue, ExperimentalConditions, ExtractionResult

        logger.info(f"Contextualized extraction for: {paper.title[:50]}...")

        if not self.is_configured():
            return ExtractionResult(
                paper_id=paper.id,
                paper_title=paper.title,
                success=False,
                error_message="contextual_llm_unconfigured",
                extraction_notes="Contextualized extraction skipped because no OpenAI-compatible base URL is configured.",
            )

        # Prepare content: prefer already-loaded full text, else abstract.
        content = paper.full_text if use_full_text and paper.full_text else paper.abstract
        if not content:
            return ExtractionResult(
                paper_id=paper.id,
                paper_title=paper.title,
                success=False,
                error_message="No content available"
            )

        # Truncate content to fit context window
        content = content[:15000]

        # Build dynamic prompt from target properties. str.replace is used
        # (not str.format) because the template contains literal JSON braces.
        prompt_template = build_extraction_prompt(self.target_properties, self.extra_instructions)
        prompt = prompt_template.replace("{title}", paper.title or "Unknown").replace("{content}", content)

        try:
            # Call LLM
            raw_response = self._call_llm(prompt)

            if not raw_response:
                # NOTE(review): is_configured() was already checked above,
                # so in practice this always reports the empty-response case.
                return ExtractionResult(
                    paper_id=paper.id,
                    paper_title=paper.title,
                    success=False,
                    error_message="contextual_llm_unconfigured" if not self.is_configured() else "LLM returned empty response"
                )

            # Parse response
            data_points = self._parse_response(raw_response, paper.id)

            return ExtractionResult(
                paper_id=paper.id,
                paper_title=paper.title,
                data_points=data_points,
                extraction_model=self.model_id,
                success=True
            )

        except Exception as e:
            logger.warning(f"Contextualized extraction failed for {paper.id}: {e}")
            return ExtractionResult(
                paper_id=paper.id,
                paper_title=paper.title,
                success=False,
                error_message=str(e)
            )

    def _call_llm(self, prompt: str) -> Optional[str]:
        """Call LLM via OpenAI-compatible API.

        Returns the assistant message text, or None when the extractor is
        unconfigured. HTTP errors propagate (raise_for_status).
        """
        import httpx

        if not self.is_configured():
            logger.debug("Contextualized extractor skipped: OpenAI-compatible base URL is not configured.")
            return None

        logger.info("Calling LLM for contextualized extraction...")
        logger.info(f"Using model: {self.model_id}")

        headers = {
            "Content-Type": "application/json",
        }
        # Authorization header only when a key is configured (some local
        # endpoints accept unauthenticated requests).
        if self.openai_key:
            headers["Authorization"] = f"Bearer {self.openai_key}"

        payload = {
            "model": self.model_id,
            "messages": [{"role": "user", "content": prompt}],
            "temperature": 0.2,
            "max_tokens": 3000,
        }

        with httpx.Client(timeout=120) as client:
            response = client.post(
                f"{self.openai_base_url}/chat/completions",
                json=payload,
                headers=headers
            )
            response.raise_for_status()
            data = response.json()

        return data["choices"][0]["message"]["content"]

    def _parse_response(self, response: str, paper_id: str) -> List:
        """Parse LLM response into ContextualizedValue objects.

        Malformed items are skipped individually; a completely unparseable
        response yields an empty list.
        """
        from .schemas import ContextualizedValue, ExperimentalConditions

        try:
            data = safe_json_loads(response)
        except Exception as e:
            logger.warning(f"JSON parse failed for {paper_id}: {e}")
            return []

        if data is None:
            return []

        if not isinstance(data, list):
            data = [data]

        results = []
        for item in data:
            if not isinstance(item, dict):
                continue

            try:
                # Handle conditions: nested dict becomes its own model.
                conditions_data = item.pop("conditions", {})
                conditions = ExperimentalConditions(**conditions_data) if conditions_data else ExperimentalConditions()

                # Ensure required fields: source_quote is mandatory on the
                # schema, so synthesize a placeholder when the LLM omits it.
                if "source_quote" not in item or not item.get("source_quote"):
                    item["source_quote"] = f"[Extracted from {paper_id}]"

                value = ContextualizedValue(
                    conditions=conditions,
                    **item
                )
                results.append(value)
            except Exception as e:
                logger.warning(f"Failed to parse data point: {e}")
                continue

        logger.info(f"Extracted {len(results)} contextualized data points from {paper_id}")
        return results

    def extract_from_papers(
        self,
        papers: List[PaperMetadata],
        use_full_text: bool = True
    ) -> List:
        """Batch extraction from multiple papers (sequential, one result each)."""
        results = []
        for paper in papers:
            result = self.extract_from_paper(paper, use_full_text)
            results.append(result)
        return results
|
literature/graph.py
ADDED
|
@@ -0,0 +1,450 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
LangGraph workflow for Literature Discovery System.
|
| 3 |
+
Implements: discover → download → extract → quality pipeline.
|
| 4 |
+
|
| 5 |
+
Key design principles:
|
| 6 |
+
1. All state modifications must be explicit in return values
|
| 7 |
+
2. No in-place object modification
|
| 8 |
+
3. Each node returns logs for UI feedback
|
| 9 |
+
"""
|
| 10 |
+
import logging
|
| 11 |
+
from typing import TypedDict, List, Optional, Annotated, Literal, Callable, Any
|
| 12 |
+
from datetime import datetime
|
| 13 |
+
import operator
|
| 14 |
+
|
| 15 |
+
from langgraph.graph import StateGraph, END, START
|
| 16 |
+
from langgraph.checkpoint.memory import MemorySaver
|
| 17 |
+
|
| 18 |
+
from .schemas import PaperMetadata, PolymerDataPoint, ExtractionResult, DataQuality
|
| 19 |
+
from .discovery import PaperDiscoveryAgent
|
| 20 |
+
from .extraction import DataExtractor
|
| 21 |
+
from .quality import QualityAssessor
|
| 22 |
+
|
| 23 |
+
logger = logging.getLogger(__name__)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
# ============== State Definition ==============
|
| 27 |
+
|
| 28 |
+
class LogEntry(TypedDict):
    """A single structured log record surfaced to the UI."""
    timestamp: str  # ISO-8601 wall-clock time the entry was created
    node: str  # workflow node that emitted the entry
    message: str  # human-readable text
    level: str  # info, warning, error
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
class LiteratureState(TypedDict):
    """
    Workflow state.

    Important: LangGraph state updates are based on return values.
    If you modify a field, you MUST include it in the return dict.
    """
    # Input
    search_query: str
    max_papers: int
    use_full_text: bool

    # Progress tracking
    current_node: str
    progress_percent: int  # 0-100, advanced by each node

    # Intermediate results (stored serialized so the state stays JSON-safe)
    papers: List[Any]  # List[PaperMetadata] serialized
    downloaded_pdfs: List[str]  # local paths of successfully fetched PDFs
    extraction_results: List[Any]  # List[ExtractionResult] serialized

    # Final output
    verified_data: List[Any]  # List[PolymerDataPoint] serialized
    quality_report: Optional[dict]

    # Logging & Status
    # operator.add makes `logs` append-only: each node returns only its new
    # entries and LangGraph concatenates them into the running list.
    logs: Annotated[List[LogEntry], operator.add]
    status: Literal["running", "completed", "failed", "cancelled"]
    error: Optional[str]
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def create_initial_state(
    query: str,
    max_papers: int = 10,
    use_full_text: bool = False
) -> LiteratureState:
    """Build the starting LiteratureState for a fresh workflow run."""
    # Calling the TypedDict constructor yields a plain dict with every
    # field explicitly initialized.
    return LiteratureState(
        search_query=query,
        max_papers=max_papers,
        use_full_text=use_full_text,
        current_node="start",
        progress_percent=0,
        papers=[],
        downloaded_pdfs=[],
        extraction_results=[],
        verified_data=[],
        quality_report=None,
        logs=[],
        status="running",
        error=None,
    )
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
# ============== Helper Functions ==============
|
| 91 |
+
|
| 92 |
+
def _log(node: str, message: str, level: str = "info") -> LogEntry:
    """Create a timestamped log entry for UI display."""
    return LogEntry(
        timestamp=datetime.now().isoformat(),
        node=node,
        message=message,
        level=level,
    )
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
def _serialize_paper(paper: PaperMetadata) -> dict:
    """Serialize paper for state storage (pydantic model -> plain dict)."""
    return paper.model_dump()
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
def _deserialize_paper(data: dict) -> PaperMetadata:
    """Deserialize paper from state (plain dict -> pydantic model)."""
    return PaperMetadata(**data)
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
# ============== Node Functions ==============
|
| 113 |
+
|
| 114 |
+
def discover_node(state: LiteratureState) -> dict:
    """
    Paper discovery node.
    Uses existing PaperDiscoveryAgent (synchronous).
    """
    node_name = "discover"
    logs = [_log(node_name, f"Searching for: '{state['search_query']}'")]

    try:
        found = PaperDiscoveryAgent().discover(
            query=state["search_query"],
            limit_per_source=state["max_papers"],
        )
        logs.append(_log(node_name, f"Found {len(found)} unique papers"))

        # Papers are stored serialized so the state remains JSON-safe.
        return {
            "papers": [_serialize_paper(p) for p in found],
            "current_node": node_name,
            "progress_percent": 25,
            "logs": logs,
        }
    except Exception as exc:
        logger.exception(f"Discover node failed: {exc}")
        logs.append(_log(node_name, f"Error: {exc}", "error"))
        return {
            "papers": [],
            "current_node": node_name,
            "status": "failed",
            "error": str(exc),
            "logs": logs,
        }
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
def download_node(state: LiteratureState) -> dict:
    """
    PDF download node.
    Uses existing PDFRetriever (synchronous).
    """
    from .retrieval import PDFRetriever

    node_name = "download"
    paper_dicts = state["papers"]
    logs = [_log(node_name, f"Downloading content for {len(paper_dicts)} papers")]

    try:
        candidates = [_deserialize_paper(d) for d in paper_dicts]
        updated = PDFRetriever().retrieve_batch(candidates)

        # A paper counts as downloaded once it carries a local pdf_path.
        pdf_paths = [p.pdf_path for p in updated if p.pdf_path]
        logs.append(_log(node_name, f"Downloaded {len(pdf_paths)}/{len(updated)} PDFs"))

        # Re-serialize so updated pdf_path values persist in the state.
        return {
            "papers": [_serialize_paper(p) for p in updated],
            "downloaded_pdfs": pdf_paths,
            "current_node": node_name,
            "progress_percent": 50,
            "logs": logs,
        }
    except Exception as exc:
        logger.exception(f"Download node failed: {exc}")
        logs.append(_log(node_name, f"Error: {exc}", "error"))
        return {
            "downloaded_pdfs": [],
            "current_node": node_name,
            "status": "failed",
            "error": str(exc),
            "logs": logs,
        }
|
| 195 |
+
|
| 196 |
+
|
| 197 |
+
def extract_node(state: LiteratureState) -> dict:
    """
    Data extraction node.
    Uses existing DataExtractor (synchronous).

    Deserializes papers from state, runs extraction on those that have any
    content (PDF or abstract), and stores results back in a JSON-safe shape.
    Returns 75% progress whether or not anything was extracted; the routing
    function decides whether quality assessment runs next.
    """
    node_name = "extract"
    logs = [_log(node_name, "Extracting structured data from papers")]

    try:
        # Deserialize papers
        papers = [_deserialize_paper(p) for p in state["papers"]]

        # Filter papers with content: abstract-only papers are still usable.
        papers_with_content = [p for p in papers if p.pdf_path or p.abstract]

        if not papers_with_content:
            logs.append(_log(node_name, "No papers with content to extract", "warning"))
            return {
                "extraction_results": [],
                "current_node": node_name,
                "progress_percent": 75,
                "logs": logs,
            }

        logs.append(_log(node_name, f"Processing {len(papers_with_content)} papers with content"))

        extractor = DataExtractor()
        results = extractor.extract_from_papers(
            papers_with_content,
            use_full_text=state["use_full_text"]
        )

        total_points = sum(len(r.data_points) for r in results if r.success)
        logs.append(_log(node_name, f"Extracted {total_points} data points from {len(results)} papers"))

        # Serialize results into plain dicts (pydantic models are not
        # stored directly in graph state).
        serialized_results = []
        for r in results:
            serialized_results.append({
                "paper_id": r.paper.id if r.paper else "unknown",
                "success": r.success,
                "error_message": r.error_message,
                "data_points": [dp.model_dump() for dp in r.data_points] if r.data_points else [],
            })

        return {
            "extraction_results": serialized_results,
            "current_node": node_name,
            "progress_percent": 75,
            "logs": logs,
        }
    except Exception as e:
        logger.exception(f"Extract node failed: {e}")
        logs.append(_log(node_name, f"Error: {e}", "error"))
        return {
            "extraction_results": [],
            "current_node": node_name,
            "status": "failed",
            "error": str(e),
            "logs": logs,
        }
|
| 258 |
+
|
| 259 |
+
|
| 260 |
+
def quality_node(state: LiteratureState) -> dict:
    """
    Quality assessment node.

    Terminal node: rebuilds PolymerDataPoint models from the serialized
    extraction results, runs QualityAssessor over them, and stores the
    verified points plus a summary report. Always sets progress to 100 and
    status to "completed" on success (even when there was nothing to assess).
    """
    node_name = "quality"
    logs = [_log(node_name, "Assessing data quality")]

    try:
        # Collect all data points from serialized results; items that fail
        # to deserialize are skipped individually.
        all_points: List[PolymerDataPoint] = []
        for result_dict in state["extraction_results"]:
            if result_dict.get("success") and result_dict.get("data_points"):
                for dp_dict in result_dict["data_points"]:
                    try:
                        dp = PolymerDataPoint(**dp_dict)
                        all_points.append(dp)
                    except Exception as e:
                        logger.warning(f"Failed to deserialize data point: {e}")

        if not all_points:
            logs.append(_log(node_name, "No data points to assess", "warning"))
            return {
                "verified_data": [],
                "quality_report": None,
                "current_node": node_name,
                "progress_percent": 100,
                "status": "completed",
                "logs": logs,
            }

        assessor = QualityAssessor()
        verified, report = assessor.assess_batch(all_points)

        logs.append(_log(node_name, report.summary()))

        # Serialize the report into a plain dict for JSON-safe state storage.
        report_dict = {
            "total_points": report.total_points,
            "gold_count": report.gold_count,
            "silver_count": report.silver_count,
            "bronze_count": report.bronze_count,
            "invalid_count": report.invalid_count,
            "validation_errors": report.validation_errors,
        }

        verified_data = [dp.model_dump() for dp in verified]

        return {
            "verified_data": verified_data,
            "quality_report": report_dict,
            "current_node": node_name,
            "progress_percent": 100,
            "status": "completed",
            "logs": logs,
        }
    except Exception as e:
        logger.exception(f"Quality node failed: {e}")
        logs.append(_log(node_name, f"Error: {e}", "error"))
        return {
            "verified_data": [],
            "quality_report": None,
            "current_node": node_name,
            "status": "failed",
            "error": str(e),
            "logs": logs,
        }
|
| 326 |
+
|
| 327 |
+
|
| 328 |
+
# ============== Conditional Edges ==============
|
| 329 |
+
|
| 330 |
+
def should_continue_after_discover(state: LiteratureState) -> str:
    """Route after the discovery step: continue to download or terminate.

    Ends the workflow when the run already failed or when discovery
    produced no papers; otherwise proceeds to the download node.
    """
    failed = state.get("status") == "failed"
    no_papers = not state.get("papers")
    return "end" if failed or no_papers else "download"
|
| 337 |
+
|
| 338 |
+
|
| 339 |
+
def should_continue_after_download(state: LiteratureState) -> str:
    """Route after the download step: continue to extraction or terminate.

    Extraction can proceed if either PDFs were downloaded or paper
    metadata is still available (abstract-only extraction).
    """
    if state.get("status") == "failed":
        return "end"
    has_inputs = state.get("downloaded_pdfs") or state.get("papers")
    return "extract" if has_inputs else "end"
|
| 346 |
+
|
| 347 |
+
|
| 348 |
+
def should_continue_after_extract(state: LiteratureState) -> str:
    """Route after extraction: run quality assessment only if data exists.

    Counts data points across all successful extraction results; with
    zero points there is nothing to assess, so the workflow ends.
    """
    if state.get("status") == "failed":
        return "end"

    extracted = 0
    for result in state.get("extraction_results", []):
        if result.get("success"):
            extracted += len(result.get("data_points", []))
    return "quality" if extracted else "end"
|
| 363 |
+
|
| 364 |
+
|
| 365 |
+
# ============== Graph Builder ==============
|
| 366 |
+
|
| 367 |
+
def create_literature_graph(checkpointer=None):
    """
    Create the literature mining workflow graph.

    Pipeline: discover -> download -> extract -> quality, where each of
    the first three stages can short-circuit to END via its router.

    Args:
        checkpointer: Optional checkpoint storage (defaults to MemorySaver)

    Returns:
        Compiled LangGraph
    """
    workflow = StateGraph(LiteratureState)

    # Register the pipeline nodes.
    for node_name, node_fn in (
        ("discover", discover_node),
        ("download", download_node),
        ("extract", extract_node),
        ("quality", quality_node),
    ):
        workflow.add_node(node_name, node_fn)

    # Entry point.
    workflow.add_edge(START, "discover")

    # Conditional transitions: each router returns a key of its mapping.
    for source, router, mapping in (
        ("discover", should_continue_after_discover, {"download": "download", "end": END}),
        ("download", should_continue_after_download, {"extract": "extract", "end": END}),
        ("extract", should_continue_after_extract, {"quality": "quality", "end": END}),
    ):
        workflow.add_conditional_edges(source, router, mapping)

    workflow.add_edge("quality", END)

    # Compile with a checkpointer so runs are resumable by thread_id.
    if checkpointer is None:
        checkpointer = MemorySaver()
    return workflow.compile(checkpointer=checkpointer)
|
| 415 |
+
|
| 416 |
+
|
| 417 |
+
# ============== Sync Runner ==============
|
| 418 |
+
|
| 419 |
+
def run_workflow(
    query: str,
    max_papers: int = 10,
    use_full_text: bool = False,
    thread_id: str = "default",
    on_state_update: Optional[Callable[[LiteratureState], None]] = None,
) -> LiteratureState:
    """
    Run the literature mining workflow (synchronous).

    Streams state snapshots from the graph, invoking the optional callback
    on every update, and returns the last observed state.

    Args:
        query: Search query
        max_papers: Max papers per source
        use_full_text: Whether to use full text extraction
        thread_id: Thread ID for state recovery
        on_state_update: Callback for state updates

    Returns:
        Final state
    """
    graph = create_literature_graph()
    state = create_initial_state(query, max_papers, use_full_text)
    run_config = {"configurable": {"thread_id": thread_id}}

    latest = None
    for snapshot in graph.stream(state, run_config, stream_mode="values"):
        latest = snapshot
        if on_state_update is not None:
            on_state_update(snapshot)

    return latest
|
literature/property_registry.py
ADDED
|
@@ -0,0 +1,274 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Property catalog and extraction prompt builder for production literature mining.
|
| 3 |
+
|
| 4 |
+
This registry is aligned to the platform's public property keys so staged
|
| 5 |
+
literature evidence can be consumed by Property Probe and Discovery without
|
| 6 |
+
ad-hoc remapping.
|
| 7 |
+
"""
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
import re
|
| 11 |
+
from typing import Dict, List, Optional
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
# Canonical property key -> {"name": display name, "unit": standard unit}.
# These keys mirror the platform's public property identifiers so staged
# literature evidence can be consumed downstream without remapping.
PROPERTY_CATALOG: Dict[str, Dict[str, str]] = {
    # Thermal
    "tm": {"name": "Melting temperature", "unit": "K"},
    "tg": {"name": "Glass transition temperature", "unit": "K"},
    "td": {"name": "Thermal diffusivity", "unit": "m^2/s"},
    "tc": {"name": "Thermal conductivity", "unit": "W/(m*K)"},
    "cp": {"name": "Specific heat capacity", "unit": "J/(kg*K)"},
    # Mechanical
    "young": {"name": "Young's modulus", "unit": "GPa"},
    "shear": {"name": "Shear modulus", "unit": "GPa"},
    "bulk": {"name": "Bulk modulus", "unit": "GPa"},
    "poisson": {"name": "Poisson ratio", "unit": "dimensionless"},
    # Transport
    "visc": {"name": "Viscosity", "unit": "Pa*s"},
    "dif": {"name": "Diffusivity", "unit": "cm^2/s"},
    # Gas permeability
    "phe": {"name": "He permeability", "unit": "Barrer"},
    "ph2": {"name": "H2 permeability", "unit": "Barrer"},
    "pco2": {"name": "CO2 permeability", "unit": "Barrer"},
    "pn2": {"name": "N2 permeability", "unit": "Barrer"},
    "po2": {"name": "O2 permeability", "unit": "Barrer"},
    "pch4": {"name": "CH4 permeability", "unit": "Barrer"},
    # Electronic / optical
    "alpha": {"name": "Polarizability", "unit": "a.u."},
    "homo": {"name": "HOMO energy", "unit": "eV"},
    "lumo": {"name": "LUMO energy", "unit": "eV"},
    "bandgap": {"name": "Band gap", "unit": "eV"},
    "mu": {"name": "Dipole moment", "unit": "Debye"},
    "etotal": {"name": "Total electronic energy", "unit": "eV"},
    "ri": {"name": "Refractive index", "unit": "dimensionless"},
    "dc": {"name": "Dielectric constant", "unit": "dimensionless"},
    "pe": {"name": "Permittivity", "unit": "dimensionless"},
    # Structural / physical
    "rg": {"name": "Radius of gyration", "unit": "Angstrom"},
    "rho": {"name": "Density", "unit": "g/cm^3"},
    # Extended literature-only properties retained for discovery/search
    "electrical_conductivity": {"name": "Electrical conductivity", "unit": "S/cm"},
    "seebeck_coefficient": {"name": "Seebeck coefficient", "unit": "uV/K"},
    "power_factor": {"name": "Power factor", "unit": "uW/(m*K^2)"},
    "zt_figure_of_merit": {"name": "ZT figure of merit", "unit": "dimensionless"},
    "tensile_strength": {"name": "Tensile strength", "unit": "MPa"},
    "elongation_at_break": {"name": "Elongation at break", "unit": "%"},
    "crystallinity": {"name": "Crystallinity", "unit": "%"},
}


# Subset of catalog keys that the platform itself serves (excludes the
# extended literature-only properties above).
PLATFORM_PROPERTY_KEYS = [
    "tm", "tg", "td", "tc", "cp",
    "young", "shear", "bulk", "poisson",
    "visc", "dif",
    "phe", "ph2", "pco2", "pn2", "po2", "pch4",
    "alpha", "homo", "lumo", "bandgap", "mu", "etotal", "ri", "dc", "pe",
    "rg", "rho",
]


# Named property groupings offered as extraction templates in the UI/API.
TEMPLATES: Dict[str, List[str]] = {
    "thermal": ["tm", "tg", "td", "tc", "cp"],
    "mechanical": ["young", "shear", "bulk", "poisson", "tensile_strength", "elongation_at_break"],
    "electronic": ["bandgap", "homo", "lumo", "ri", "dc", "pe", "alpha", "mu", "etotal"],
    "gas_permeability": ["pco2", "po2", "pn2", "ph2", "phe", "pch4"],
    "transport": ["visc", "dif", "tc", "electrical_conductivity", "seebeck_coefficient", "power_factor"],
    "platform_core": PLATFORM_PROPERTY_KEYS,
}

# Human-readable labels for the template groupings above.
TEMPLATE_LABELS: Dict[str, str] = {
    "thermal": "Thermal",
    "mechanical": "Mechanical",
    "electronic": "Electronic / Optical",
    "gas_permeability": "Gas Permeability",
    "transport": "Transport / Energy",
    "platform_core": "Platform Core",
}


# Free-form alias -> canonical catalog key. Lookups go through _norm(),
# which maps underscores/punctuation to spaces; the underscore-form entries
# near the bottom are kept for direct dict access without normalization.
PROPERTY_ALIASES: Dict[str, str] = {
    "thermal conductivity": "tc",
    "heat conductivity": "tc",
    "thermal diffusivity": "td",
    "heat diffusivity": "td",
    "specific heat": "cp",
    "heat capacity": "cp",
    "young modulus": "young",
    "youngs modulus": "young",
    "young_s_modulus": "young",
    "young_modulus": "young",
    "shear modulus": "shear",
    "shear_modulus": "shear",
    "bulk modulus": "bulk",
    "bulk_modulus": "bulk",
    "poisson ratio": "poisson",
    "poisson_ratio": "poisson",
    "viscosity": "visc",
    "diffusivity": "dif",
    "he permeability": "phe",
    "helium permeability": "phe",
    "h2 permeability": "ph2",
    "co2 permeability": "pco2",
    "n2 permeability": "pn2",
    "o2 permeability": "po2",
    "ch4 permeability": "pch4",
    "polarizability": "alpha",
    "homo energy": "homo",
    "lumo energy": "lumo",
    "band gap": "bandgap",
    "bandgap": "bandgap",
    "dipole moment": "mu",
    "total electronic energy": "etotal",
    "refractive index": "ri",
    "dielectric constant": "dc",
    "permittivity": "pe",
    "radius of gyration": "rg",
    "density": "rho",
    "electrical conductivity": "electrical_conductivity",
    "conductivity": "electrical_conductivity",
    "seebeck coefficient": "seebeck_coefficient",
    "power factor": "power_factor",
    "zt": "zt_figure_of_merit",
    "zt figure of merit": "zt_figure_of_merit",
    "tensile strength": "tensile_strength",
    "elongation at break": "elongation_at_break",
    "co2_permeability": "pco2",
    "o2_permeability": "po2",
    "n2_permeability": "pn2",
    "h2_permeability": "ph2",
    "he_permeability": "phe",
    "ch4_permeability": "pch4",
    "radius_of_gyration": "rg",
    "refractive_index": "ri",
    "dielectric_constant": "dc",
    "dipole_moment": "mu",
}
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
def _norm(text: str) -> str:
|
| 149 |
+
normalized = re.sub(r"[^a-z0-9]+", " ", str(text or "").strip().lower())
|
| 150 |
+
return re.sub(r"\s+", " ", normalized).strip()
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
# Seed the alias table with every canonical key and display name so that
# normalize_property_key() accepts either form. setdefault() ensures the
# hand-written aliases above are never overwritten.
for key, meta in PROPERTY_CATALOG.items():
    PROPERTY_ALIASES.setdefault(_norm(key), key)
    PROPERTY_ALIASES.setdefault(_norm(meta["name"]), key)
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
def normalize_property_key(value: str | None) -> Optional[str]:
    """Map free-form property text to a canonical registry key.

    Returns None for empty input, unknown aliases, or aliases that point
    at a key absent from the catalog.
    """
    if not value:
        return None
    candidate = PROPERTY_ALIASES.get(_norm(value))
    # Guard against aliases mapping to keys that were removed from the catalog.
    return candidate if candidate in PROPERTY_CATALOG else None
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
def detect_property_keys(text: str) -> List[str]:
    """Return all unique property keys whose alias appears in *text*.

    Aliases are matched on whole-word boundaries within the normalized
    text rather than as raw substrings. The previous substring check let
    short aliases fire inside unrelated words (e.g. alias "zt" matched
    any text containing "quartz"). Multi-word aliases still match as
    contiguous phrases because normalization reduces all separators to
    single spaces.

    Args:
        text: Free-form text (query, title, abstract, ...).

    Returns:
        Canonical property keys in first-seen alias order, de-duplicated.
    """
    # Pad with spaces so every alias can be tested as " alias ".
    haystack = f" {_norm(text)} "
    out: List[str] = []
    for alias, key in PROPERTY_ALIASES.items():
        if alias and f" {alias} " in haystack and key not in out:
            out.append(key)
    return out
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
def property_display_name(key: str) -> str:
    """Return a human-readable "Name (unit)" label; unknown keys echo back."""
    meta = PROPERTY_CATALOG.get(key)
    return f"{meta['name']} ({meta['unit']})" if meta else key
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
def _property_list_block(property_keys: List[str]) -> str:
    """Render the target-properties bullet list for the extraction prompt.

    Cataloged keys include their display name and standard unit; unknown
    keys are listed bare so custom properties still appear in the prompt.
    """
    rendered = [
        f"- `{k}` ({PROPERTY_CATALOG[k]['name']}) -- standard unit: {PROPERTY_CATALOG[k]['unit']}"
        if k in PROPERTY_CATALOG
        else f"- `{k}`"
        for k in property_keys
    ]
    return "\n".join(rendered)
|
| 195 |
+
|
| 196 |
+
|
| 197 |
+
def build_extraction_prompt(
    property_keys: List[str],
    extra_instructions: str = "",
) -> str:
    """
    Build a dynamic contextualized extraction prompt from the given property list.

    Args:
        property_keys: Property keys or free-form names; each is normalized
            via normalize_property_key, and unknown keys pass through as-is.
        extra_instructions: Optional context inserted verbatim into its own
            "ADDITIONAL CONTEXT" section.

    Returns:
        Prompt template string. Note: it still contains literal ``{title}``
        and ``{content}`` placeholders for the caller to fill in later.
    """
    # Normalize where possible; keep unknown keys so custom properties survive.
    normalized_keys = [normalize_property_key(k) or k for k in property_keys if k]
    props_block = _property_list_block(normalized_keys)

    extra_section = ""
    if extra_instructions.strip():
        extra_section = f"""
## ADDITIONAL CONTEXT

{extra_instructions.strip()}
"""

    # Double braces below emit literal braces in the output, which is how the
    # JSON example and the deferred {title}/{content} placeholders survive
    # this f-string.
    prompt = f"""You are an expert in polymer science and materials characterization.
Extract experimentally grounded evidence records from the provided paper.

## CRITICAL REQUIREMENTS

1. Extract each material-property-value observation as a separate record
2. Preserve the original value and unit exactly as written
3. Include experimental conditions and measurement method whenever available
4. Include a source quote and source location for every record
5. Ignore theoretical-only values unless the paper explicitly reports an experiment-backed measurement

## TARGET PROPERTIES

For each data point, extract these properties:
{props_block}
{extra_section}
## OUTPUT FORMAT (JSON Array)

Return ONLY valid JSON, no markdown, no explanation:

[
  {{
    "polymer_name": "P3HT",
    "property_name": "<one of the target property keys above>",
    "raw_value": "1.9",
    "raw_unit": "eV",
    "conditions": {{
      "solvent": "chloroform",
      "annealing_temp_c": 150,
      "annealing_time_min": 10,
      "measurement_temp_k": 300,
      "measurement_method": "UV-Vis"
    }},
    "source_quote": "The optical band gap of P3HT was determined to be 1.9 eV from the UV-Vis absorption onset.",
    "source_location": "Table 1",
    "extraction_confidence": 0.95
  }}
]

## RULES

1. If values range "from X to Y", extract BOTH as separate points
2. Preserve scientific notation as "5.2e3" or actual number
3. If no source quote is available, lower extraction_confidence below 0.5
4. Prefer experimentally measured values over model predictions or simulations
5. Return ONLY a valid JSON array, no extra text

---

**PAPER CONTENT:**

Title: {{title}}

{{content}}

---

JSON output:
"""
    return prompt
|
literature/quality.py
ADDED
|
@@ -0,0 +1,176 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Production quality assessment and validation for literature evidence.
|
| 3 |
+
"""
|
| 4 |
+
from __future__ import annotations
|
| 5 |
+
|
| 6 |
+
import logging
|
| 7 |
+
from dataclasses import dataclass
|
| 8 |
+
from typing import Any, Dict, List, Optional, Tuple
|
| 9 |
+
|
| 10 |
+
from .schemas import ContextualizedValue, DataQuality, PolymerDataPoint
|
| 11 |
+
|
| 12 |
+
logger = logging.getLogger(__name__)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
@dataclass
class QualityReport:
    """Data quality report for a batch of data points."""
    total_points: int            # size of the assessed batch, including invalid
    gold_count: int              # points rated GOLD
    silver_count: int            # points rated SILVER
    bronze_count: int            # points rated BRONZE
    invalid_count: int           # points that failed validation entirely
    validation_errors: List[str]  # human-readable validation failure messages

    @property
    def gold_ratio(self) -> float:
        """Fraction of all points rated gold; 0.0 for an empty batch."""
        return self.gold_count / max(self.total_points, 1)

    def summary(self) -> str:
        """Return a multi-line human-readable summary of the tier counts."""
        parts = [
            f"Quality Report: {self.total_points} points",
            f" Gold: {self.gold_count} ({self.gold_ratio:.1%})",
            f" Silver: {self.silver_count}",
            f" Bronze: {self.bronze_count}",
            f" Invalid: {self.invalid_count}",
            f" Errors: {len(self.validation_errors)}",
        ]
        return "\n".join(parts)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
class QualityAssessor:
    """Quality assessor with property-aware sanity checks."""

    # Plausible (min, max) ranges per canonical property key, expressed in the
    # registry's standard units. A None bound means unbounded on that side.
    # Values outside their range are rejected as implausible extractions.
    PROPERTY_BOUNDS: Dict[str, Tuple[Optional[float], Optional[float]]] = {
        "tm": (50, 2000),
        "tg": (50, 2000),
        "td": (1e-10, 1.0),
        "tc": (1e-4, 1000.0),
        "cp": (1.0, 1e7),
        "young": (1e-6, 1e5),
        "shear": (1e-6, 1e5),
        "bulk": (1e-6, 1e5),
        "poisson": (-1.0, 0.5),
        "visc": (1e-9, 1e9),
        "dif": (1e-12, 10.0),
        "rho": (1e-6, 100.0),
        "ri": (0.5, 10.0),
        "bandgap": (-20.0, 20.0),
        "homo": (-30.0, 10.0),
        "lumo": (-30.0, 20.0),
        "mu": (0.0, 1e4),
        "electrical_conductivity": (1e-12, 1e8),
        "seebeck_coefficient": (-1e5, 1e5),
        "power_factor": (0.0, 1e9),
        "zt_figure_of_merit": (0.0, 1e4),
    }

    def __init__(self) -> None:
        # Validation error messages accumulated during assess_batch().
        self.errors: List[str] = []

    def assess_batch(self, data_points: List[PolymerDataPoint]) -> Tuple[List[PolymerDataPoint], QualityReport]:
        """Legacy compatibility path used by older scripts."""
        self.errors = []
        valid_points: List[PolymerDataPoint] = []
        gold_count = silver_count = bronze_count = invalid_count = 0

        for dp in data_points:
            is_valid, error_msg = self._validate_legacy(dp)
            if not is_valid:
                self.errors.append(f"{dp.source_paper_id}: {error_msg}")
                invalid_count += 1
                continue

            # Tier is written back onto the data point as a side effect.
            dp.quality_tier = self._compute_legacy_quality_tier(dp)
            if dp.quality_tier == DataQuality.GOLD:
                gold_count += 1
            elif dp.quality_tier == DataQuality.SILVER:
                silver_count += 1
            else:
                bronze_count += 1
            valid_points.append(dp)

        report = QualityReport(
            total_points=len(data_points),
            gold_count=gold_count,
            silver_count=silver_count,
            bronze_count=bronze_count,
            invalid_count=invalid_count,
            validation_errors=self.errors.copy(),
        )
        logger.info(report.summary())
        return valid_points, report

    def validate_contextual_value(self, value: ContextualizedValue) -> Tuple[bool, Optional[str]]:
        """Validate one contextualized value; return (is_valid, error_message)."""
        if not value.polymer_name or value.polymer_name.strip().lower() == "unknown":
            return False, "Missing material name"
        if not value.property_name:
            return False, "Missing property key"
        if value.standardized_value is None:
            return False, "Missing standardized value"
        if not value.source_quote or len(value.source_quote.strip()) < 10:
            return False, "Missing source quote"

        # Properties without registered bounds are accepted unconditionally.
        bounds = self.PROPERTY_BOUNDS.get(value.property_name)
        if bounds is None:
            return True, None

        low, high = bounds
        numeric = value.standardized_value
        if low is not None and numeric < low:
            return False, f"Value below plausible range: {numeric}"
        if high is not None and numeric > high:
            return False, f"Value above plausible range: {numeric}"
        return True, None

    def assess_contextual_quality(self, value: ContextualizedValue) -> DataQuality:
        """Score a contextualized value into a GOLD/SILVER/BRONZE tier."""
        score = 0
        if value.standardized_value is not None:
            score += 2
        # Up to 3 points for how many experimental conditions were captured.
        if value.conditions.to_dict():
            score += min(len(value.conditions.to_dict()), 3)
        if value.conditions.measurement_method:
            score += 1
        if value.source_location:
            score += 1
        if value.extraction_confidence >= 0.9:
            score += 2
        elif value.extraction_confidence >= 0.7:
            score += 1

        if score >= 7:
            return DataQuality.GOLD
        if score >= 4:
            return DataQuality.SILVER
        return DataQuality.BRONZE

    def _validate_legacy(self, dp: PolymerDataPoint) -> Tuple[bool, Optional[str]]:
        """Minimal validity check for the legacy thermoelectric schema."""
        if not dp.polymer_name or dp.polymer_name == "Unknown":
            return False, "Missing polymer name"
        # At least one of the legacy measurement fields must be present.
        has_measurement = any([
            dp.electrical_conductivity_s_cm is not None,
            dp.thermal_conductivity_w_mk is not None,
            dp.seebeck_coefficient_uv_k is not None,
        ])
        if not has_measurement:
            return False, "No measurement values"
        return True, None

    def _compute_legacy_quality_tier(self, dp: PolymerDataPoint) -> DataQuality:
        """Score a legacy data point into a GOLD/SILVER/BRONZE tier."""
        score = 0
        if dp.electrical_conductivity_s_cm is not None:
            score += 3
        if dp.seebeck_coefficient_uv_k is not None:
            score += 2
        if dp.power_factor_uw_m_k2 is not None:
            score += 1
        # Thermal conductivity is the hardest to measure, so it is weighted highest.
        if dp.thermal_conductivity_w_mk is not None:
            score += 4
        if dp.source_table_or_figure:
            score += 1
        if dp.annealing_temp_c is not None:
            score += 1
        if score >= 7:
            return DataQuality.GOLD
        if score >= 4:
            return DataQuality.SILVER
        return DataQuality.BRONZE
|
literature/retrieval.py
ADDED
|
@@ -0,0 +1,398 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
PDF retrieval module.
|
| 3 |
+
Downloads papers from ArXiv (priority) and via Unpaywall.
|
| 4 |
+
Implements robust header spoofing and graceful error handling.
|
| 5 |
+
"""
|
| 6 |
+
import logging
|
| 7 |
+
import os
|
| 8 |
+
import time
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
from typing import Optional, List
|
| 11 |
+
import requests
|
| 12 |
+
|
| 13 |
+
from .schemas import PaperMetadata, PaperSource
|
| 14 |
+
from .config import get_config
|
| 15 |
+
|
| 16 |
+
logger = logging.getLogger(__name__)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class PDFRetriever:
|
| 20 |
+
"""
|
| 21 |
+
PDF retrieval with robust error handling.
|
| 22 |
+
|
| 23 |
+
Priority:
|
| 24 |
+
1. ArXiv (direct, free, reliable)
|
| 25 |
+
2. Existing pdf_url from metadata
|
| 26 |
+
3. Unpaywall via DOI
|
| 27 |
+
"""
|
| 28 |
+
|
| 29 |
+
def __init__(self) -> None:
|
| 30 |
+
config = get_config()
|
| 31 |
+
self.storage_dir = Path(config.pdf_storage_dir)
|
| 32 |
+
self.storage_dir.mkdir(parents=True, exist_ok=True)
|
| 33 |
+
|
| 34 |
+
# Robust headers to avoid 403
|
| 35 |
+
self.headers = {
|
| 36 |
+
"User-Agent": config.user_agent,
|
| 37 |
+
"Accept": "application/pdf,*/*",
|
| 38 |
+
"Accept-Language": "en-US,en;q=0.9",
|
| 39 |
+
"Accept-Encoding": "gzip, deflate, br",
|
| 40 |
+
"Connection": "keep-alive",
|
| 41 |
+
}
|
| 42 |
+
|
| 43 |
+
self.timeout = 60 # seconds
|
| 44 |
+
self.unpaywall_email = config.pubmed_email
|
| 45 |
+
|
| 46 |
+
def retrieve_batch(
|
| 47 |
+
self,
|
| 48 |
+
papers: List[PaperMetadata],
|
| 49 |
+
skip_existing: bool = True
|
| 50 |
+
) -> List[PaperMetadata]:
|
| 51 |
+
"""
|
| 52 |
+
Download PDFs for a batch of papers.
|
| 53 |
+
Updates paper.pdf_path for successful downloads.
|
| 54 |
+
Saves all papers and failed downloads to CSVs.
|
| 55 |
+
|
| 56 |
+
Args:
|
| 57 |
+
papers: List of paper metadata
|
| 58 |
+
skip_existing: Skip if PDF already exists
|
| 59 |
+
|
| 60 |
+
Returns:
|
| 61 |
+
Updated list of papers with pdf_path set where successful
|
| 62 |
+
"""
|
| 63 |
+
successful_ids: set = set()
|
| 64 |
+
failed_papers: List[PaperMetadata] = []
|
| 65 |
+
|
| 66 |
+
for paper in papers:
|
| 67 |
+
try:
|
| 68 |
+
pdf_path = self.retrieve_single(paper, skip_existing=skip_existing)
|
| 69 |
+
if pdf_path:
|
| 70 |
+
paper.pdf_path = pdf_path
|
| 71 |
+
successful_ids.add(paper.id)
|
| 72 |
+
else:
|
| 73 |
+
failed_papers.append(paper)
|
| 74 |
+
except Exception as e:
|
| 75 |
+
logger.warning(f"PDF retrieval failed for {paper.id}: {e}")
|
| 76 |
+
failed_papers.append(paper)
|
| 77 |
+
|
| 78 |
+
logger.info(f"PDF retrieval complete: {len(successful_ids)} successful, {len(failed_papers)} failed")
|
| 79 |
+
|
| 80 |
+
# Save all papers with download status
|
| 81 |
+
self._save_all_papers(papers, successful_ids)
|
| 82 |
+
|
| 83 |
+
# Save failed downloads for manual retrieval
|
| 84 |
+
if failed_papers:
|
| 85 |
+
self._save_failed_downloads(failed_papers)
|
| 86 |
+
|
| 87 |
+
return papers
|
| 88 |
+
|
| 89 |
+
def _save_failed_downloads(self, papers: List[PaperMetadata]) -> None:
|
| 90 |
+
"""Save failed downloads to CSV for manual retrieval."""
|
| 91 |
+
import csv
|
| 92 |
+
from datetime import datetime
|
| 93 |
+
|
| 94 |
+
csv_path = self.storage_dir / "failed_downloads.csv"
|
| 95 |
+
file_exists = csv_path.exists()
|
| 96 |
+
|
| 97 |
+
with open(csv_path, "a", newline="", encoding="utf-8") as f:
|
| 98 |
+
writer = csv.writer(f)
|
| 99 |
+
|
| 100 |
+
# Write header if new file
|
| 101 |
+
if not file_exists:
|
| 102 |
+
writer.writerow([
|
| 103 |
+
"timestamp", "paper_id", "title", "source", "doi", "url", "expected_filename"
|
| 104 |
+
])
|
| 105 |
+
|
| 106 |
+
timestamp = datetime.now().isoformat()
|
| 107 |
+
for paper in papers:
|
| 108 |
+
safe_id = paper.id.replace("/", "_").replace(":", "_")
|
| 109 |
+
expected_filename = f"{safe_id}.pdf"
|
| 110 |
+
writer.writerow([
|
| 111 |
+
timestamp,
|
| 112 |
+
paper.id,
|
| 113 |
+
paper.title[:100], # Truncate long titles
|
| 114 |
+
paper.source.value,
|
| 115 |
+
paper.doi or "",
|
| 116 |
+
paper.url or "",
|
| 117 |
+
expected_filename
|
| 118 |
+
])
|
| 119 |
+
|
| 120 |
+
logger.info(f"Saved {len(papers)} failed downloads to {csv_path}")
|
| 121 |
+
|
| 122 |
+
def _save_all_papers(
|
| 123 |
+
self,
|
| 124 |
+
papers: List[PaperMetadata],
|
| 125 |
+
successful_ids: set
|
| 126 |
+
) -> None:
|
| 127 |
+
"""Save all discovered papers to CSV with download status."""
|
| 128 |
+
import csv
|
| 129 |
+
from datetime import datetime
|
| 130 |
+
|
| 131 |
+
csv_path = self.storage_dir / "all_papers.csv"
|
| 132 |
+
|
| 133 |
+
with open(csv_path, "w", newline="", encoding="utf-8") as f:
|
| 134 |
+
writer = csv.writer(f)
|
| 135 |
+
writer.writerow([
|
| 136 |
+
"paper_id", "title", "source", "year", "doi", "url",
|
| 137 |
+
"pdf_downloaded", "pdf_path", "timestamp"
|
| 138 |
+
])
|
| 139 |
+
|
| 140 |
+
timestamp = datetime.now().isoformat()
|
| 141 |
+
for paper in papers:
|
| 142 |
+
downloaded = paper.id in successful_ids or paper.pdf_path is not None
|
| 143 |
+
writer.writerow([
|
| 144 |
+
paper.id,
|
| 145 |
+
paper.title[:150],
|
| 146 |
+
paper.source.value,
|
| 147 |
+
paper.year or "",
|
| 148 |
+
paper.doi or "",
|
| 149 |
+
paper.url or "",
|
| 150 |
+
"YES" if downloaded else "NO",
|
| 151 |
+
paper.pdf_path or "",
|
| 152 |
+
timestamp
|
| 153 |
+
])
|
| 154 |
+
|
| 155 |
+
logger.info(f"Saved {len(papers)} papers to {csv_path}")
|
| 156 |
+
|
| 157 |
+
def retrieve_single(
|
| 158 |
+
self,
|
| 159 |
+
paper: PaperMetadata,
|
| 160 |
+
skip_existing: bool = True
|
| 161 |
+
) -> Optional[str]:
|
| 162 |
+
"""
|
| 163 |
+
Download PDF for a single paper.
|
| 164 |
+
|
| 165 |
+
Args:
|
| 166 |
+
paper: Paper metadata
|
| 167 |
+
skip_existing: Skip if file already exists
|
| 168 |
+
|
| 169 |
+
Returns:
|
| 170 |
+
Path to downloaded PDF, or None if failed
|
| 171 |
+
"""
|
| 172 |
+
# Determine filename
|
| 173 |
+
safe_id = paper.id.replace("/", "_").replace(":", "_")
|
| 174 |
+
pdf_filename = f"{safe_id}.pdf"
|
| 175 |
+
pdf_path = self.storage_dir / pdf_filename
|
| 176 |
+
|
| 177 |
+
# Check if already exists
|
| 178 |
+
if skip_existing and pdf_path.exists():
|
| 179 |
+
logger.debug(f"PDF already exists: {pdf_path}")
|
| 180 |
+
return str(pdf_path)
|
| 181 |
+
|
| 182 |
+
# Try download methods in priority order
|
| 183 |
+
pdf_url = self._get_pdf_url(paper)
|
| 184 |
+
|
| 185 |
+
if pdf_url:
|
| 186 |
+
success = self._download_pdf(pdf_url, pdf_path)
|
| 187 |
+
if success:
|
| 188 |
+
logger.info(f"Downloaded PDF: {pdf_path}")
|
| 189 |
+
return str(pdf_path)
|
| 190 |
+
|
| 191 |
+
logger.warning(f"Could not download PDF for {paper.id}")
|
| 192 |
+
return None
|
| 193 |
+
|
| 194 |
+
def _get_pdf_url(self, paper: PaperMetadata) -> Optional[str]:
|
| 195 |
+
"""
|
| 196 |
+
Get PDF URL using priority order:
|
| 197 |
+
1. ArXiv direct link
|
| 198 |
+
2. PubMed Central (PMC) for PubMed papers
|
| 199 |
+
3. Existing pdf_url from metadata
|
| 200 |
+
4. Unpaywall via DOI
|
| 201 |
+
"""
|
| 202 |
+
# Priority 1: ArXiv (most reliable, free)
|
| 203 |
+
if paper.source == PaperSource.ARXIV:
|
| 204 |
+
arxiv_id = paper.id.replace("arxiv_", "")
|
| 205 |
+
return f"https://arxiv.org/pdf/{arxiv_id}.pdf"
|
| 206 |
+
|
| 207 |
+
# Priority 2: PubMed - try PMC first
|
| 208 |
+
if paper.source == PaperSource.PUBMED:
|
| 209 |
+
pmc_url = self._get_pmc_pdf_url(paper)
|
| 210 |
+
if pmc_url:
|
| 211 |
+
return pmc_url
|
| 212 |
+
|
| 213 |
+
# Priority 3: Use existing pdf_url if available
|
| 214 |
+
if paper.pdf_url:
|
| 215 |
+
return paper.pdf_url
|
| 216 |
+
|
| 217 |
+
# Priority 4: Try Unpaywall via DOI (works for all sources)
|
| 218 |
+
if paper.doi:
|
| 219 |
+
unpaywall_url = self._get_unpaywall_url(paper.doi)
|
| 220 |
+
if unpaywall_url:
|
| 221 |
+
return unpaywall_url
|
| 222 |
+
|
| 223 |
+
return None
|
| 224 |
+
|
| 225 |
+
def _get_pmc_pdf_url(self, paper: PaperMetadata) -> Optional[str]:
|
| 226 |
+
"""
|
| 227 |
+
Try to get PDF from PubMed Central (PMC).
|
| 228 |
+
PMC provides free full-text PDFs for many PubMed articles.
|
| 229 |
+
"""
|
| 230 |
+
try:
|
| 231 |
+
pmid = paper.id.replace("pubmed_", "")
|
| 232 |
+
|
| 233 |
+
# Try elink to get PMC ID
|
| 234 |
+
from Bio import Entrez
|
| 235 |
+
handle = Entrez.elink(dbfrom="pubmed", db="pmc", id=pmid)
|
| 236 |
+
record = Entrez.read(handle)
|
| 237 |
+
handle.close()
|
| 238 |
+
|
| 239 |
+
# Check if PMC ID exists
|
| 240 |
+
link_sets = record[0].get("LinkSetDb", [])
|
| 241 |
+
for link_set in link_sets:
|
| 242 |
+
if link_set.get("DbTo") == "pmc":
|
| 243 |
+
links = link_set.get("Link", [])
|
| 244 |
+
if links:
|
| 245 |
+
pmc_id = links[0]["Id"]
|
| 246 |
+
# PMC PDF URL format
|
| 247 |
+
return f"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC{pmc_id}/pdf/"
|
| 248 |
+
|
| 249 |
+
return None
|
| 250 |
+
|
| 251 |
+
except Exception as e:
|
| 252 |
+
logger.debug(f"PMC lookup failed for {paper.id}: {e}")
|
| 253 |
+
return None
|
| 254 |
+
|
| 255 |
+
def _get_unpaywall_url(self, doi: str) -> Optional[str]:
|
| 256 |
+
"""
|
| 257 |
+
Query Unpaywall API for open-access PDF URL.
|
| 258 |
+
|
| 259 |
+
Args:
|
| 260 |
+
doi: Paper DOI
|
| 261 |
+
|
| 262 |
+
Returns:
|
| 263 |
+
PDF URL if found, None otherwise
|
| 264 |
+
"""
|
| 265 |
+
try:
|
| 266 |
+
url = f"https://api.unpaywall.org/v2/{doi}"
|
| 267 |
+
params = {"email": self.unpaywall_email}
|
| 268 |
+
|
| 269 |
+
response = requests.get(
|
| 270 |
+
url,
|
| 271 |
+
params=params,
|
| 272 |
+
headers=self.headers,
|
| 273 |
+
timeout=30
|
| 274 |
+
)
|
| 275 |
+
|
| 276 |
+
if response.status_code != 200:
|
| 277 |
+
logger.debug(f"Unpaywall returned {response.status_code} for {doi}")
|
| 278 |
+
return None
|
| 279 |
+
|
| 280 |
+
data = response.json()
|
| 281 |
+
|
| 282 |
+
# Check for best open access location
|
| 283 |
+
best_oa = data.get("best_oa_location")
|
| 284 |
+
if best_oa and best_oa.get("url_for_pdf"):
|
| 285 |
+
return best_oa["url_for_pdf"]
|
| 286 |
+
|
| 287 |
+
# Check all OA locations
|
| 288 |
+
oa_locations = data.get("oa_locations", [])
|
| 289 |
+
for loc in oa_locations:
|
| 290 |
+
if loc.get("url_for_pdf"):
|
| 291 |
+
return loc["url_for_pdf"]
|
| 292 |
+
|
| 293 |
+
return None
|
| 294 |
+
|
| 295 |
+
except Exception as e:
|
| 296 |
+
logger.debug(f"Unpaywall query failed for {doi}: {e}")
|
| 297 |
+
return None
|
| 298 |
+
|
| 299 |
+
def _download_pdf(self, url: str, save_path: Path) -> bool:
|
| 300 |
+
"""
|
| 301 |
+
Download PDF from URL with robust error handling.
|
| 302 |
+
|
| 303 |
+
Args:
|
| 304 |
+
url: PDF URL
|
| 305 |
+
save_path: Local path to save file
|
| 306 |
+
|
| 307 |
+
Returns:
|
| 308 |
+
True if successful, False otherwise
|
| 309 |
+
"""
|
| 310 |
+
try:
|
| 311 |
+
logger.debug(f"Downloading PDF from: {url}")
|
| 312 |
+
|
| 313 |
+
response = requests.get(
|
| 314 |
+
url,
|
| 315 |
+
headers=self.headers,
|
| 316 |
+
timeout=self.timeout,
|
| 317 |
+
stream=True,
|
| 318 |
+
allow_redirects=True
|
| 319 |
+
)
|
| 320 |
+
|
| 321 |
+
# Check for success
|
| 322 |
+
if response.status_code != 200:
|
| 323 |
+
logger.warning(f"Download failed with status {response.status_code}: {url}")
|
| 324 |
+
return False
|
| 325 |
+
|
| 326 |
+
# Verify it's a PDF (check content-type or magic bytes)
|
| 327 |
+
content_type = response.headers.get("content-type", "").lower()
|
| 328 |
+
if "pdf" not in content_type and "octet-stream" not in content_type:
|
| 329 |
+
# Check magic bytes as fallback
|
| 330 |
+
first_bytes = response.content[:8]
|
| 331 |
+
if not first_bytes.startswith(b"%PDF"):
|
| 332 |
+
logger.warning(f"Response is not a PDF: {content_type}")
|
| 333 |
+
return False
|
| 334 |
+
|
| 335 |
+
# Save to file
|
| 336 |
+
with open(save_path, "wb") as f:
|
| 337 |
+
for chunk in response.iter_content(chunk_size=8192):
|
| 338 |
+
f.write(chunk)
|
| 339 |
+
|
| 340 |
+
# Verify file was written
|
| 341 |
+
if save_path.exists() and save_path.stat().st_size > 0:
|
| 342 |
+
return True
|
| 343 |
+
|
| 344 |
+
return False
|
| 345 |
+
|
| 346 |
+
except requests.exceptions.Timeout:
|
| 347 |
+
logger.warning(f"Download timeout: {url}")
|
| 348 |
+
return False
|
| 349 |
+
except requests.exceptions.RequestException as e:
|
| 350 |
+
logger.warning(f"Download error: {e}")
|
| 351 |
+
return False
|
| 352 |
+
except Exception as e:
|
| 353 |
+
logger.error(f"Unexpected error downloading {url}: {e}")
|
| 354 |
+
return False
|
| 355 |
+
|
| 356 |
+
|
| 357 |
+
def extract_text_from_pdf(pdf_path: str, max_pages: int = 100) -> Optional[str]:
    """
    Extract text from PDF using pymupdf.

    Args:
        pdf_path: Path to PDF file
        max_pages: Maximum pages to extract (default 100)

    Returns:
        Extracted text with "--- Page N ---" markers, or None if extraction
        failed or produced only whitespace
    """
    # pymupdf was historically importable as "fitz"; accept either name.
    try:
        import pymupdf  # fitz
    except ImportError:
        try:
            import fitz as pymupdf
        except ImportError:
            logger.error("pymupdf not installed. Run: pip install pymupdf")
            return None

    try:
        doc = pymupdf.open(pdf_path)
        try:
            text_parts: List[str] = []
            pages_to_extract = min(len(doc), max_pages)

            for page_num in range(pages_to_extract):
                page = doc[page_num]
                text = page.get_text()
                if text:
                    text_parts.append(f"--- Page {page_num + 1} ---\n{text}")
        finally:
            # Always release the document handle, even if a page fails mid-loop.
            doc.close()

        full_text = "\n\n".join(text_parts)
        logger.info(f"Extracted {len(full_text)} chars from {pages_to_extract} pages of {pdf_path}")

        return full_text if full_text.strip() else None

    except Exception as e:
        logger.error(f"PDF text extraction failed for {pdf_path}: {e}")
        return None
|
literature/schemas.py
ADDED
|
@@ -0,0 +1,329 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Domain-specific data models for literature mining.
|
| 3 |
+
Supports contextualized extraction with source traceability.
|
| 4 |
+
"""
|
| 5 |
+
from typing import Optional, List, Dict, Any
|
| 6 |
+
from pydantic import BaseModel, Field, field_validator, model_validator, ConfigDict
|
| 7 |
+
from datetime import datetime
|
| 8 |
+
from enum import Enum
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class DataQuality(str, Enum):
    """Data quality tier assigned to extracted evidence."""
    GOLD = "gold"      # Complete data with source quote
    SILVER = "silver"  # Partial data with source
    BRONZE = "bronze"  # Limited data or no source
    ERROR = "error"    # Extraction failed
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class QueryMode(str, Enum):
    """High-level search entrypoint modes."""
    # Values are the hyphenated identifiers used by the query payload,
    # not the bare member names.
    MATERIAL = "material-first"
    PROPERTY = "property-first"
    TASK = "task-first"
|
| 25 |
+
|
| 26 |
+
class ReviewStatus(str, Enum):
    """Human review status for staged evidence."""
    PENDING = "pending"    # Awaiting reviewer decision (default for new records)
    APPROVED = "approved"
    REJECTED = "rejected"
|
| 32 |
+
|
| 33 |
+
class PaperSource(str, Enum):
    """Paper source identifier."""
    PUBMED = "pubmed"
    ARXIV = "arxiv"
    SEMANTIC_SCHOLAR = "s2"  # Semantic Scholar; matches the "s2_" ID prefix
    MANUAL = "manual"
    UNKNOWN = "unknown"
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
class PaperMetadata(BaseModel):
    """Paper metadata from discovery.

    Instances are produced by the discovery stage; ``pdf_path`` is filled
    in later by the retrieval stage once a PDF has been downloaded.
    """
    # Unique ID carries its source as a prefix, e.g. "arxiv_2101.00001".
    id: str = Field(..., description="Unique ID, format: {source}_{original_id}")
    title: str
    authors: List[str] = Field(default_factory=list)
    year: Optional[int] = None
    doi: Optional[str] = None
    abstract: Optional[str] = None
    venue: Optional[str] = None
    citation_count: Optional[int] = None
    is_open_access: Optional[bool] = None
    source: PaperSource = PaperSource.UNKNOWN
    url: Optional[str] = None
    landing_url: Optional[str] = None
    pdf_url: Optional[str] = None
    # Local filesystem path set after a successful PDF download.
    pdf_path: Optional[str] = None
    # Extracted full text, when available — presumably populated downstream;
    # not set anywhere in this module.
    full_text: Optional[str] = None
    match_reasons: List[str] = Field(default_factory=list)
    background_status: Optional[str] = None
    retrieved_at: datetime = Field(default_factory=datetime.now)

    @field_validator('id')
    @classmethod
    def validate_id_format(cls, v: str) -> str:
        """Ensure ID format is correct (must carry a known source prefix)."""
        # Prefixes correspond to PaperSource values ("s2_" = Semantic Scholar).
        valid_prefixes = ['pubmed_', 'arxiv_', 's2_', 'manual_']
        if not any(v.startswith(p) for p in valid_prefixes):
            raise ValueError(f"ID must start with one of {valid_prefixes}")
        return v
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
class LiteratureQuerySpec(BaseModel):
    """Normalized query payload used by the production literature UI."""
    mode: QueryMode
    user_query: str
    polymer_name: Optional[str] = None
    canonical_smiles: Optional[str] = None
    property_key: Optional[str] = None
    project_id: Optional[str] = None
    # Bounds keep extraction cost and result-page size in check.
    top_k_extract: int = Field(default=10, ge=1, le=50)
    result_limit: int = Field(default=15, ge=1, le=100)
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
class PaperCardResult(BaseModel):
    """User-facing paper card summary."""
    paper_id: str
    title: str
    year: Optional[int] = None
    venue: Optional[str] = None
    doi: Optional[str] = None
    landing_url: Optional[str] = None
    pdf_url: Optional[str] = None
    is_open_access: bool = False
    # Human-readable explanations of why this paper matched the query.
    match_reasons: List[str] = Field(default_factory=list)
    background_status: str = "discovered"
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
class LiteratureSupportSummary(BaseModel):
    """Aggregated evidence coverage for a material/property view."""
    matched_paper_count: int = 0
    oa_paper_count: int = 0          # open-access subset of matched papers
    evidence_record_count: int = 0
    approved_record_count: int = 0   # records with review_status == APPROVED
    has_experimental_evidence: bool = False
    # 0-100 score; scoring formula lives outside this module.
    literature_support_score: int = Field(default=0, ge=0, le=100)
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
class LiteratureEvidenceRecord(BaseModel):
    """Production staging record for extracted literature evidence."""
    id: Optional[str] = None
    project_id: Optional[str] = None
    paper_id: str
    material_name: str
    canonical_smiles: Optional[str] = None
    property_key: str
    # Value/unit exactly as reported in the paper.
    raw_value: str
    raw_unit: str
    # Filled by the unit standardizer when conversion succeeds.
    standardized_value: Optional[float] = None
    standardized_unit: Optional[str] = None
    conditions_json: Dict[str, Any] = Field(default_factory=dict)
    method: Optional[str] = None
    # Verbatim supporting quote; validated below to be substantive.
    evidence_quote: str
    evidence_location: Optional[str] = None
    extractor_version: str
    extraction_model: Optional[str] = None
    extraction_confidence: float = Field(default=0.5, ge=0.0, le=1.0)
    quality_tier: DataQuality = DataQuality.BRONZE
    review_status: ReviewStatus = ReviewStatus.PENDING
    reviewer_note: Optional[str] = None
    # Reviewer-edited replacement payload, if the record was corrected.
    edited_payload_json: Optional[Dict[str, Any]] = None
    created_at: Optional[str] = None
    updated_at: Optional[str] = None

    @field_validator("evidence_quote")
    @classmethod
    def validate_evidence_quote(cls, v: str) -> str:
        """Require a substantive quote: coerced to str, stripped, >= 10 chars."""
        text = str(v or "").strip()
        if len(text) < 10:
            raise ValueError("evidence_quote must be at least 10 characters")
        return text
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
# ============== Experimental Conditions ==============
|
| 145 |
+
|
| 146 |
+
class ExperimentalConditions(BaseModel):
    """
    Experimental conditions with full context.

    ⚠️ extra="allow" keeps LLM-returned fields like humidity, substrate, etc.
    """
    model_config = ConfigDict(extra="allow")

    # Preparation conditions (units are encoded in the field names)
    solvent: Optional[str] = None
    concentration_mg_ml: Optional[float] = None
    spin_speed_rpm: Optional[int] = None
    spin_time_s: Optional[int] = None
    annealing_temp_c: Optional[float] = None
    annealing_time_min: Optional[float] = None
    annealing_atmosphere: Optional[str] = None
    film_thickness_nm: Optional[float] = None

    # Measurement conditions
    measurement_temp_k: Optional[float] = Field(None, description="Measurement temperature (K)")
    measurement_method: Optional[str] = None
    measurement_direction: Optional[str] = None  # in-plane, cross-plane

    def to_dict(self) -> dict:
        """Convert to dict, excluding None values.

        NOTE(review): with extra="allow", model_dump() should also emit the
        undeclared LLM-provided fields — confirm against the pydantic version
        in use.
        """
        return {k: v for k, v in self.model_dump().items() if v is not None}
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
# ============== Contextualized Value ==============
|
| 175 |
+
|
| 176 |
+
class ContextualizedValue(BaseModel):
    """
    Measurement value with full experimental context and source traceability.

    Design principles:
    - Same paper may report multiple values under different conditions
    - Each value MUST have its associated experimental conditions
    - MANDATORY: source_quote for traceability
    """
    model_config = ConfigDict(extra="allow")

    # Material
    polymer_name: str = Field(..., description="Polymer name e.g. PEDOT:PSS")
    dopant: Optional[str] = None
    dopant_ratio: Optional[str] = None

    # Property measured
    property_name: str = Field(..., description="Property name e.g. electrical_conductivity")

    # Raw value, exactly as printed in the paper
    raw_value: str = Field(..., description="Raw value string from paper")
    raw_unit: str = Field(..., description="Original unit from paper")

    # Standardized value (filled by Standardizer)
    standardized_value: Optional[float] = None
    standardized_unit: Optional[str] = None
    standardization_error: Optional[str] = None

    # Experimental conditions under which this specific value was measured
    conditions: ExperimentalConditions = Field(default_factory=ExperimentalConditions)

    # Source traceability (MANDATORY!)
    source_quote: str = Field(..., description="Exact quote from paper containing this value")
    source_location: Optional[str] = Field(None, description="Table 1, Figure 3a, etc.")

    # Quality
    extraction_confidence: float = Field(default=0.5, ge=0.0, le=1.0)
    quality_tier: DataQuality = DataQuality.BRONZE

    @field_validator('source_quote')
    @classmethod
    def quote_not_empty(cls, v: str) -> str:
        # Reject missing or trivially short quotes; returns the stripped text.
        if not v or len(v.strip()) < 10:
            raise ValueError("source_quote must be >10 chars")
        return v.strip()

    def to_db_dict(self) -> dict:
        """Convert to database storage format (flat dict; enum -> value string)."""
        return {
            "polymer_name": self.polymer_name,
            "dopant": self.dopant,
            "dopant_ratio": self.dopant_ratio,
            "property_name": self.property_name,
            "raw_value": self.raw_value,
            "raw_unit": self.raw_unit,
            "standardized_value": self.standardized_value,
            "standardized_unit": self.standardized_unit,
            # Conditions serialized without None entries (see to_dict()).
            "conditions": self.conditions.to_dict(),
            "source_quote": self.source_quote,
            "source_location": self.source_location,
            "extraction_confidence": self.extraction_confidence,
            "quality_tier": self.quality_tier.value,
        }
|
| 239 |
+
|
| 240 |
+
|
| 241 |
+
# ============== Legacy PolymerDataPoint (for compatibility) ==============
|
| 242 |
+
|
| 243 |
+
class PolymerDataPoint(BaseModel):
    """Single data point extracted from literature (legacy format).

    Superseded by ContextualizedValue, which carries per-value experimental
    conditions and a mandatory source quote; kept for compatibility.
    """
    # Material Information
    polymer_name: str = Field(..., description="Polymer name, e.g. P3HT, PEDOT:PSS")
    polymer_class: Optional[str] = Field(None, description="Polymer class")
    dopant: Optional[str] = None
    dopant_ratio: Optional[str] = None

    # Processing Conditions (units encoded in field names)
    solvent: Optional[str] = None
    concentration_mg_ml: Optional[float] = None
    spin_speed_rpm: Optional[int] = None
    spin_time_s: Optional[int] = None
    annealing_temp_c: Optional[float] = None
    annealing_time_min: Optional[float] = None
    annealing_atmosphere: Optional[str] = None
    film_thickness_nm: Optional[float] = None

    # Electrical Properties
    electrical_conductivity_s_cm: Optional[float] = None
    seebeck_coefficient_uv_k: Optional[float] = None
    power_factor_uw_m_k2: Optional[float] = None

    # Thermal Properties
    thermal_conductivity_w_mk: Optional[float] = None
    zt_figure_of_merit: Optional[float] = None

    # Structural
    xrd_crystallinity_percent: Optional[float] = None
    xrd_pi_stacking_angstrom: Optional[float] = None
    xrd_lamellar_spacing_angstrom: Optional[float] = None

    # Metadata
    source_paper_id: str
    source_table_or_figure: Optional[str] = None
    extraction_confidence: float = Field(default=0.5, ge=0.0, le=1.0)
    quality_tier: DataQuality = DataQuality.BRONZE
    raw_text_snippet: Optional[str] = None

    @field_validator('electrical_conductivity_s_cm', 'thermal_conductivity_w_mk', mode='before')
    @classmethod
    def validate_positive(cls, v: Any) -> Optional[float]:
        # Negative conductivities are silently discarded (set to None) rather
        # than raising — presumably to tolerate noisy extractions; confirm
        # this is intended before relying on it.
        if v is not None and isinstance(v, (int, float)) and v < 0:
            return None
        return v
|
| 288 |
+
|
| 289 |
+
|
| 290 |
+
# ============== Extraction Result ==============
|
| 291 |
+
|
| 292 |
+
class ExtractionResult(BaseModel):
    """
    Extraction result for a single paper.

    Supports both old format (paper=PaperMetadata) and new format (paper_id, paper_title).
    """
    model_config = ConfigDict(extra="allow")

    # New format fields (preferred)
    paper_id: Optional[str] = None
    paper_title: Optional[str] = None

    # Old format field (for backward compatibility)
    paper: Optional[PaperMetadata] = None

    # Common fields
    data_points: List = Field(default_factory=list)  # Can be ContextualizedValue or PolymerDataPoint
    extraction_model: str = "unknown"
    extraction_timestamp: Any = Field(default_factory=lambda: datetime.now().isoformat())
    success: bool = True
    error_message: Optional[str] = None

    # Legacy fields
    llm_model_used: Optional[str] = None
    extraction_notes: Optional[str] = None

    @model_validator(mode='after')
    def extract_paper_fields(self):
        """Extract paper_id and paper_title from paper if not provided.

        Runs after field validation; bridges the legacy ``paper`` /
        ``llm_model_used`` fields into the new flat fields without
        overwriting explicitly supplied values.
        """
        if self.paper is not None:
            if self.paper_id is None:
                self.paper_id = self.paper.id
            if self.paper_title is None:
                self.paper_title = self.paper.title
        # Copy llm_model_used to extraction_model if present
        if self.llm_model_used and self.extraction_model == "unknown":
            self.extraction_model = self.llm_model_used
        return self
|
literature/standardizer.py
ADDED
|
@@ -0,0 +1,211 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Unit standardization for production literature evidence.
|
| 3 |
+
|
| 4 |
+
The standard units are aligned with the platform property catalog so extracted
|
| 5 |
+
evidence can be compared and filtered consistently before human review.
|
| 6 |
+
"""
|
| 7 |
+
import logging
|
| 8 |
+
import re
|
| 9 |
+
from dataclasses import dataclass
|
| 10 |
+
from typing import Callable, Dict, List, Optional
|
| 11 |
+
|
| 12 |
+
from .property_registry import PROPERTY_CATALOG
|
| 13 |
+
|
| 14 |
+
logger = logging.getLogger(__name__)
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
@dataclass
|
| 18 |
+
class StandardizationResult:
|
| 19 |
+
"""Standardization result."""
|
| 20 |
+
success: bool
|
| 21 |
+
value: Optional[float] = None
|
| 22 |
+
unit: Optional[str] = None
|
| 23 |
+
error: Optional[str] = None
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def normalize_minus_signs(s: str) -> str:
    """Normalize all Unicode minus signs to ASCII hyphen-minus."""
    # Single-pass translation of every dash/minus lookalike to "-".
    dash_like = "−–—‐‑‒⁻₋➖"
    return s.translate(str.maketrans(dict.fromkeys(dash_like, "-")))
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def _identity(value: float) -> float:
    # No-op conversion: used when a raw unit already matches the standard unit.
    return value
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def _mul(factor: float) -> Callable[[float], float]:
|
| 41 |
+
return lambda value: value * factor
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def _add(delta: float) -> Callable[[float], float]:
|
| 45 |
+
return lambda value: value + delta
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
class UnitStandardizer:
    """Convert raw values from papers to platform-standard units."""

    # Target unit per property key, sourced from the platform property catalog
    # so evidence lines up with the rest of the platform.
    STANDARD_UNITS = {key: meta["unit"] for key, meta in PROPERTY_CATALOG.items()}

    # Raw-unit spellings mapped to canonical unit names. Keys appear
    # lower-cased — presumably _normalize_unit folds case before lookup;
    # confirm when touching that method.
    UNIT_ALIASES = {
        # Temperature
        "k": "K",
        "kelvin": "K",
        "c": "C",
        "°c": "C",
        "deg c": "C",
        "celsius": "C",
        # Thermal
        "w/mk": "W/(m*K)",
        "w/(m·k)": "W/(m*K)",
        "w m-1 k-1": "W/(m*K)",
        "w·m⁻¹·k⁻¹": "W/(m*K)",
        "mw/(m*k)": "mW/(m*K)",
        "mw/(m·k)": "mW/(m*K)",
        "j/kgk": "J/(kg*K)",
        "j/(kg·k)": "J/(kg*K)",
        "j/(kg*k)": "J/(kg*K)",
        "j/gk": "J/(g*K)",
        "j/(g*k)": "J/(g*K)",
        # Mechanical
        "gpa": "GPa",
        "mpa": "MPa",
        # Transport / physical
        "pa s": "Pa*s",
        "pa·s": "Pa*s",
        "pas": "Pa*s",
        "mpa*s": "mPa*s",
        "cm2/s": "cm^2/s",
        "cm^2/s": "cm^2/s",
        "mm2/s": "mm^2/s",
        "mm^2/s": "mm^2/s",
        "g/cm3": "g/cm^3",
        "g/cm^3": "g/cm^3",
        "kg/m3": "kg/m^3",
        "kg/m^3": "kg/m^3",
        "ang": "Angstrom",
        "angstrom": "Angstrom",
        "å": "Angstrom",
        "nm": "nm",
        # Electronics
        "ev": "eV",
        "a.u.": "a.u.",
        "au": "a.u.",
        "debye": "Debye",
        # Gas / transport
        "barrer": "Barrer",
        # Extended literature properties
        "s/cm": "S/cm",
        "s m-1": "S/m",
        "s/m": "S/m",
        "uv/k": "uV/K",
        "μv/k": "uV/K",
        "µv/k": "uV/K",
        "mv/k": "mV/K",
        "uw/(m*k^2)": "uW/(m*K^2)",
        "uw/(m*k**2)": "uW/(m*K^2)",
        "uw/(m·k²)": "uW/(m*K^2)",
        "mw/(m*k^2)": "mW/(m*K^2)",
        "%": "%",
        # Empty string marks dimensionless quantities.
        "dimensionless": "",
        "-": "",
        "": "",
    }

    # Per-property conversion table: (from_unit, to_unit) -> transform.
    # Only the listed pairs are convertible; anything else fails standardize().
    CONVERSIONS: Dict[str, Dict[tuple[str, str], Callable[[float], float]]] = {
        "tm": {("C", "K"): _add(273.15)},
        "tg": {("C", "K"): _add(273.15)},
        "cp": {("J/(g*K)", "J/(kg*K)"): _mul(1000.0)},
        "tc": {("mW/(m*K)", "W/(m*K)"): _mul(0.001)},
        "young": {("MPa", "GPa"): _mul(0.001)},
        "shear": {("MPa", "GPa"): _mul(0.001)},
        "bulk": {("MPa", "GPa"): _mul(0.001)},
        "visc": {("mPa*s", "Pa*s"): _mul(0.001)},
        "dif": {("mm^2/s", "cm^2/s"): _mul(0.01)},
        "rho": {("kg/m^3", "g/cm^3"): _mul(0.001)},
        "rg": {("nm", "Angstrom"): _mul(10.0)},
        "electrical_conductivity": {("S/m", "S/cm"): _mul(0.01)},
        "seebeck_coefficient": {("mV/K", "uV/K"): _mul(1000.0)},
        "power_factor": {("mW/(m*K^2)", "uW/(m*K^2)"): _mul(1000.0)},
    }
|
| 134 |
+
|
| 135 |
+
def standardize(
    self,
    property_name: str,
    raw_value: str,
    raw_unit: str,
) -> StandardizationResult:
    """Parse *raw_value* and express it in the property's canonical unit.

    Returns a StandardizationResult whose ``success`` flag says whether
    the value could be parsed and converted; on failure ``error`` carries
    a human-readable reason.
    """
    # Parse the numeric portion first; a malformed value makes any unit
    # handling pointless.
    try:
        value = self._parse_numeric(raw_value)
    except ValueError as exc:
        return StandardizationResult(success=False, error=f"Parse error: {exc}")

    target = self.STANDARD_UNITS.get(property_name)
    if target is None:
        return StandardizationResult(success=False, error=f"Unknown property: {property_name}")

    unit_key = self._normalize_unit(raw_unit)

    # Dimensionless targets pass the value through with an empty unit.
    if target in {"dimensionless", ""}:
        return StandardizationResult(success=True, value=value, unit="")

    # Already in the canonical unit: nothing to convert.
    if unit_key == target:
        return StandardizationResult(success=True, value=value, unit=target)

    # Registered (source, target) conversion for this property?
    convert = self.CONVERSIONS.get(property_name, {}).get((unit_key, target))
    if convert is not None:
        return StandardizationResult(success=True, value=convert(value), unit=target)

    if unit_key == "":
        return StandardizationResult(success=False, error=f"Missing unit for {property_name}")

    return StandardizationResult(
        success=False,
        error=f"Cannot convert {unit_key} to {target} for {property_name}",
    )
|
| 168 |
+
|
| 169 |
+
def _parse_numeric(self, value_str: str) -> float:
    """Parse a raw literature value string into a float.

    Handles unicode minus signs, superscript digits, scientific notation
    written as "a x 10^b" or "a × 10⁻ᵇ", numeric ranges ("3.1 - 4.2"
    averaged to their midpoint), and "value ± uncertainty" forms (the
    uncertainty is dropped).

    Raises:
        ValueError: if the string cannot be interpreted as a number.
    """
    s = normalize_minus_signs(str(value_str or "").strip())

    # Map superscript digits/minus to ASCII *before* rewriting the
    # "x 10^n" notation, so exponents written with superscripts
    # (e.g. "1.2 × 10⁻³") are recognized too.  Previously this ran
    # after the regex below, which made such values unparseable.
    s = s.translate(str.maketrans("⁰¹²³⁴⁵⁶⁷⁸⁹⁻", "0123456789-"))

    # "a x 10^n" / "a × 10n" -> "aen" so float() can parse it.
    s = re.sub(r"\s*[×x]\s*10\^?\s*(-?\d+)", r"e\1", s)
    s = s.replace(" ", "")

    # A bare "low-high" range is reported as its midpoint.  Negative
    # bounds are deliberately excluded: the "-" would be ambiguous.
    range_match = re.match(r"^(\d+(?:\.\d+)?)\s*-\s*(\d+(?:\.\d+)?)$", s)
    if range_match:
        low = float(range_match.group(1))
        high = float(range_match.group(2))
        return (low + high) / 2

    # "value ± uncertainty": keep the central value only.
    pm_match = re.match(r"^([\d.eE+-]+)\s*±\s*[\d.eE+-]+$", s)
    if pm_match:
        return float(pm_match.group(1))

    return float(s)
|
| 191 |
+
|
| 192 |
+
def _normalize_unit(self, unit: str) -> str:
    """Normalize a raw unit string to its canonical spelling.

    Maps unicode superscripts and middle dots to ASCII, collapses
    whitespace, lower-cases for the alias lookup, and returns the
    UNIT_ALIASES entry when one exists.  Unknown units come back in
    their cleaned (but not lower-cased) form.
    """
    normalized = normalize_minus_signs(str(unit or "").strip())
    normalized = normalized.replace("²", "^2").replace("³", "^3")
    # Middle dot -> "*"; non-breaking space -> plain ASCII space so the
    # alias lookup is insensitive to typography.  (The NBSP is written
    # as an escape — as a literal character it reads as a no-op.)
    normalized = normalized.replace("·", "*").replace("\u00a0", " ")
    key = re.sub(r"\s+", " ", normalized.lower()).strip()
    return self.UNIT_ALIASES.get(key, normalized)
|
| 198 |
+
|
| 199 |
+
def standardize_data_points(self, data_points: List) -> List:
    """Standardize every data point in place and return the same list.

    On success, ``standardized_value`` / ``standardized_unit`` are set
    on the point; on failure, ``standardization_error`` records why.
    """
    for point in data_points:
        outcome = self.standardize(
            property_name=point.property_name,
            raw_value=point.raw_value,
            raw_unit=point.raw_unit,
        )
        if not outcome.success:
            point.standardization_error = outcome.error
            continue
        point.standardized_value = outcome.value
        point.standardized_unit = outcome.unit
    return data_points
|
scripts/__pycache__/run_literature_mining.cpython-313.pyc
ADDED
|
Binary file (7.79 kB). View file
|
|
|
scripts/evaluate_polyie.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import argparse
|
| 5 |
+
import json
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
|
| 8 |
+
from literature.evaluation import evaluate_predictions, load_json_records
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def main() -> None:
    """Command-line entry point: score predictions against gold labels.

    Reads two record files (JSON or JSONL), computes POLYIE-style
    metrics, prints them as pretty-printed JSON, and optionally writes
    the same text to --out.
    """
    parser = argparse.ArgumentParser(description="Evaluate extraction output against a POLYIE-style gold file.")
    parser.add_argument("--gold", required=True, help="Gold file (.json or .jsonl)")
    parser.add_argument("--pred", required=True, help="Prediction file (.json or .jsonl)")
    parser.add_argument("--out", default=None, help="Optional JSON output path")
    args = parser.parse_args()

    metrics = evaluate_predictions(
        load_json_records(args.gold),
        load_json_records(args.pred),
    )
    report = json.dumps(metrics, indent=2, ensure_ascii=False)
    print(report)

    if args.out:
        # Persist exactly what was printed, newline-terminated.
        Path(args.out).write_text(report + "\n", encoding="utf-8")
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
if __name__ == "__main__":
|
| 29 |
+
main()
|
scripts/run_literature_mining.py
ADDED
|
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Project-based literature mining CLI.
|
| 4 |
+
|
| 5 |
+
Examples:
|
| 6 |
+
python scripts/run_literature_mining.py --query "PEDOT:PSS thermoelectric" --limit 5
|
| 7 |
+
python scripts/run_literature_mining.py --project-id proj_xxx --query "P3HT conductivity" --save-mode files
|
| 8 |
+
"""
|
| 9 |
+
from __future__ import annotations
|
| 10 |
+
|
| 11 |
+
import argparse
|
| 12 |
+
import csv
|
| 13 |
+
import json
|
| 14 |
+
from pathlib import Path
|
| 15 |
+
from typing import Any, Dict, List
|
| 16 |
+
|
| 17 |
+
from dotenv import load_dotenv
|
| 18 |
+
|
| 19 |
+
from src.literature_service import (
|
| 20 |
+
DataPointRepo,
|
| 21 |
+
LiteraturePipeline,
|
| 22 |
+
ProjectRepo,
|
| 23 |
+
QueryIntentService,
|
| 24 |
+
QuerySessionRepo,
|
| 25 |
+
get_database,
|
| 26 |
+
)
|
| 27 |
+
|
| 28 |
+
load_dotenv()
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def resolve_project_id(project_id: str | None, projects: ProjectRepo) -> str:
    """Return a usable project id.

    An explicitly supplied id is validated against the repo; otherwise
    the first existing project is used, and when none exist a default
    project is created.

    Raises:
        ValueError: if an explicit *project_id* matches no project.
    """
    if project_id:
        if not projects.get_project(project_id):
            raise ValueError(f"Project not found: {project_id}")
        return project_id

    candidates = projects.list_projects()
    if candidates:
        # Fall back to the first (presumably most relevant) project.
        return candidates[0]["id"]

    new_project = projects.create_project(
        name="Default Literature Project",
        description="Auto-created by run_literature_mining.py",
    )
    return new_project["id"]
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def export_points_to_files(project_id: str, points: List[Dict[str, Any]], out_dir: Path) -> None:
    """Export validated data points to JSONL and CSV files under *out_dir*.

    The directory is created if needed.  The CSV header is the union of
    keys across all rows in first-seen order, so heterogeneous rows no
    longer raise ValueError (previously the header came from points[0]
    only); missing values are written as empty cells.  An empty point
    list still produces both files (the CSV gets a stub header).

    *project_id* is currently unused but kept for interface stability.
    """
    out_dir.mkdir(parents=True, exist_ok=True)

    jsonl_path = out_dir / "validated_points.jsonl"
    with jsonl_path.open("w", encoding="utf-8") as f:
        for row in points:
            f.write(json.dumps(row, ensure_ascii=False) + "\n")

    csv_path = out_dir / "validated_points.csv"
    if points:
        # Union of keys in first-seen order (dict preserves insertion
        # order); DictWriter's default restval="" fills missing cells.
        fieldnames: Dict[str, None] = {}
        for row in points:
            for key in row:
                fieldnames.setdefault(key, None)
        with csv_path.open("w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=list(fieldnames))
            writer.writeheader()
            writer.writerows(points)
    else:
        csv_path.write_text("point_id,project_id\n", encoding="utf-8")

    print(f"Exported {len(points)} rows to:")
    print(f" - {jsonl_path}")
    print(f" - {csv_path}")
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
def main() -> None:
    """CLI entry point: run the literature-mining pipeline for one project."""
    parser = argparse.ArgumentParser(description="Project-based Literature Mining CLI")
    parser.add_argument("--project-id", default=None, help="Target project ID")
    parser.add_argument("--query", default="PEDOT:PSS thermoelectric conductivity", help="Search query")
    parser.add_argument("--limit", type=int, default=5, help="Max papers per source")
    parser.add_argument("--strategy", choices=["simple", "paperqa"], default="simple", help="Extraction strategy")
    parser.add_argument("--model-provider", default="openai_compatible", help="Model provider name")
    parser.add_argument("--model-name", default="gpt-oss:latest", help="Model name")
    parser.add_argument("--save-mode", choices=["sqlite", "files"], default="sqlite", help="Result sink mode")
    parser.add_argument("--no-save", action="store_true", help="Do not persist result to sqlite")
    # NOTE(review): --manual-upload-dir is parsed but never read below.
    parser.add_argument("--manual-upload-dir", default="data/literature/manual_uploads", help="Reserved for batch manual upload")
    args = parser.parse_args()

    # Wire repositories and the pipeline against the shared sqlite DB.
    db = get_database("data/app.db")
    project_repo = ProjectRepo(db)
    point_repo = DataPointRepo(db)
    query_repo = QuerySessionRepo(db)
    query_intent = QueryIntentService(query_repo)
    pipeline = LiteraturePipeline(db_path="data/app.db")

    # Resolve (or lazily create) the target project, then print a banner.
    target_project_id = resolve_project_id(args.project_id, project_repo)
    project = project_repo.get_project(target_project_id)
    print("=" * 64)
    print("Project-Based Literature Mining")
    print(f"Project: {project['name']} ({target_project_id})")
    print(f"Query: {args.query}")
    print(f"Limit per source: {args.limit}")
    print(f"Strategy: {args.strategy}")
    print("=" * 64)

    # Analyze and store query intent; suggestions_json holds a JSON list.
    query_session = query_intent.analyze_and_store(target_project_id, args.query)
    suggestions = json.loads(query_session.get("suggestions_json") or "[]")
    if suggestions:
        print("Query suggestions:")
        for s in suggestions:
            print(f" - {s}")
    if query_session.get("clarification_required"):
        # The CLI does not stop for clarification; it proceeds anyway.
        print("Note: query marked as pending_clarification. Continuing by CLI override.")

    if args.no_save:
        # Run the stages manually so no DB run record is created.
        discovered = pipeline.run_discovery(target_project_id, args.query, args.limit)
        retrieved = pipeline.run_retrieval(target_project_id, discovered)
        stats = pipeline.run_extraction(
            target_project_id,
            run_id=None,
            paper_rows=retrieved,
            strategy=args.strategy,
            model_name=args.model_name,
            use_full_text=True,
        )
        print(f"Extraction complete without DB run record: {stats}")
    else:
        # Full pipeline: discovery -> retrieval -> extraction, persisted.
        result = pipeline.run_full_pipeline(
            project_id=target_project_id,
            query=args.query,
            limit=args.limit,
            strategy=args.strategy,
            model_provider=args.model_provider,
            model_name=args.model_name,
            use_full_text=True,
        )
        print(f"Pipeline status: {result.get('status')}")
        if result.get("status") != "completed":
            print(f"Error: {result.get('error')}")
        else:
            print(json.dumps(result.get("stats", {}), indent=2))

    # Optionally mirror all of the project's points to JSONL/CSV files.
    points = point_repo.list_points(target_project_id)
    if args.save_mode == "files":
        run_dir = Path("data/literature/runs")
        export_points_to_files(target_project_id, points, run_dir)

    print("=" * 64)
    print("Done.")
    print("=" * 64)
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
if __name__ == "__main__":
|
| 149 |
+
main()
|
scripts/train_prior_slurm.sh
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
# SLURM batch script: distributed pretraining of the polymer RNN prior
# on 4 GPUs via torchrun.
#SBATCH --job-name=polymer_prior
#SBATCH --nodes=1
#SBATCH --gres=gpu:4
#SBATCH --cpus-per-task=16
#SBATCH --mem=64G
#SBATCH --time=24:00:00
#SBATCH --output=logs/train_prior_%j.out
#SBATCH --error=logs/train_prior_%j.err

set -euo pipefail

# Adjust these for your CRC environment
# NOTE(review): REPO_DIR looks like a local macOS checkout path, not a
# cluster path — confirm before submitting to SLURM.
REPO_DIR="/Users/xuguoyue/Documents/GitHub/POLYMER-PROPERTY"
VENV_DIR="$REPO_DIR/.venv"

cd "$REPO_DIR"

# Load modules if your CRC requires it (example)
# module load python/3.10

source "$VENV_DIR/bin/activate"

# SLURM only creates the job's stdout/stderr files if the directory exists.
mkdir -p logs

# Cap intra-op thread pools so 4 ranks don't oversubscribe the 16 CPUs.
export OMP_NUM_THREADS=8
export MKL_NUM_THREADS=8

# One process per GPU; hyperparameters mirror the training defaults.
torchrun --nproc_per_node=4 RNN/train_prior.py \
    --smiles-csv data/PI1M.csv \
    --vocab RNN/pretrained_model/voc \
    --output RNN/pretrained_model/Prior.ckpt \
    --epochs 10 \
    --batch-size 256 \
    --lr 1e-3 \
    --max-length 140 \
    --num-workers 4 \
    --log-every 200
|
src/.DS_Store
ADDED
|
Binary file (10.2 kB). View file
|
|
|
src/__pycache__/conv.cpython-310.pyc
ADDED
|
Binary file (7.21 kB). View file
|
|
|
src/__pycache__/conv.cpython-313.pyc
ADDED
|
Binary file (11.2 kB). View file
|
|
|
src/__pycache__/data_builder.cpython-310.pyc
ADDED
|
Binary file (24 kB). View file
|
|
|
src/__pycache__/data_builder.cpython-313.pyc
ADDED
|
Binary file (40.6 kB). View file
|
|
|
src/__pycache__/discover_llm.cpython-310.pyc
ADDED
|
Binary file (23 kB). View file
|
|
|
src/__pycache__/discover_llm.cpython-313.pyc
ADDED
|
Binary file (37.7 kB). View file
|
|
|
src/__pycache__/discovery.cpython-310.pyc
ADDED
|
Binary file (21 kB). View file
|
|
|
src/__pycache__/discovery.cpython-313.pyc
ADDED
|
Binary file (34.6 kB). View file
|
|
|