sobinalosious92 commited on
Commit
3f4ebee
·
verified ·
1 Parent(s): a22718f

Upload 119 files

Browse files
This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
Files changed (50) hide show
  1. literature/__init__.py +100 -0
  2. literature/__pycache__/__init__.cpython-310.pyc +0 -0
  3. literature/__pycache__/__init__.cpython-313.pyc +0 -0
  4. literature/__pycache__/clarifier.cpython-310.pyc +0 -0
  5. literature/__pycache__/clarifier.cpython-313.pyc +0 -0
  6. literature/__pycache__/config.cpython-310.pyc +0 -0
  7. literature/__pycache__/config.cpython-313.pyc +0 -0
  8. literature/__pycache__/converters.cpython-310.pyc +0 -0
  9. literature/__pycache__/converters.cpython-313.pyc +0 -0
  10. literature/__pycache__/discovery.cpython-310.pyc +0 -0
  11. literature/__pycache__/discovery.cpython-313.pyc +0 -0
  12. literature/__pycache__/evaluation.cpython-310.pyc +0 -0
  13. literature/__pycache__/extraction.cpython-310.pyc +0 -0
  14. literature/__pycache__/extraction.cpython-313.pyc +0 -0
  15. literature/__pycache__/graph.cpython-313.pyc +0 -0
  16. literature/__pycache__/property_registry.cpython-310.pyc +0 -0
  17. literature/__pycache__/property_registry.cpython-313.pyc +0 -0
  18. literature/__pycache__/quality.cpython-310.pyc +0 -0
  19. literature/__pycache__/quality.cpython-313.pyc +0 -0
  20. literature/__pycache__/retrieval.cpython-310.pyc +0 -0
  21. literature/__pycache__/retrieval.cpython-313.pyc +0 -0
  22. literature/__pycache__/schemas.cpython-310.pyc +0 -0
  23. literature/__pycache__/schemas.cpython-313.pyc +0 -0
  24. literature/__pycache__/standardizer.cpython-310.pyc +0 -0
  25. literature/__pycache__/standardizer.cpython-313.pyc +0 -0
  26. literature/clarifier.py +89 -0
  27. literature/config.py +71 -0
  28. literature/converters.py +56 -0
  29. literature/discovery.py +380 -0
  30. literature/evaluation.py +155 -0
  31. literature/extraction.py +863 -0
  32. literature/graph.py +450 -0
  33. literature/property_registry.py +274 -0
  34. literature/quality.py +176 -0
  35. literature/retrieval.py +398 -0
  36. literature/schemas.py +329 -0
  37. literature/standardizer.py +211 -0
  38. scripts/__pycache__/run_literature_mining.cpython-313.pyc +0 -0
  39. scripts/evaluate_polyie.py +29 -0
  40. scripts/run_literature_mining.py +149 -0
  41. scripts/train_prior_slurm.sh +38 -0
  42. src/.DS_Store +0 -0
  43. src/__pycache__/conv.cpython-310.pyc +0 -0
  44. src/__pycache__/conv.cpython-313.pyc +0 -0
  45. src/__pycache__/data_builder.cpython-310.pyc +0 -0
  46. src/__pycache__/data_builder.cpython-313.pyc +0 -0
  47. src/__pycache__/discover_llm.cpython-310.pyc +0 -0
  48. src/__pycache__/discover_llm.cpython-313.pyc +0 -0
  49. src/__pycache__/discovery.cpython-310.pyc +0 -0
  50. src/__pycache__/discovery.cpython-313.pyc +0 -0
literature/__init__.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Literature mining package for project-based extraction workflows."""
2
+
3
+ from .schemas import (
4
+ ContextualizedValue,
5
+ DataQuality,
6
+ ExperimentalConditions,
7
+ ExtractionResult,
8
+ LiteratureEvidenceRecord,
9
+ LiteratureQuerySpec,
10
+ LiteratureSupportSummary,
11
+ PaperMetadata,
12
+ PaperCardResult,
13
+ PaperSource,
14
+ PolymerDataPoint,
15
+ QueryMode,
16
+ ReviewStatus,
17
+ )
18
+ from .property_registry import (
19
+ PROPERTY_CATALOG,
20
+ PLATFORM_PROPERTY_KEYS,
21
+ TEMPLATES,
22
+ TEMPLATE_LABELS,
23
+ build_extraction_prompt,
24
+ detect_property_keys,
25
+ normalize_property_key,
26
+ property_display_name,
27
+ )
28
+ from .quality import QualityAssessor, QualityReport
29
+ from .standardizer import StandardizationResult, UnitStandardizer, normalize_minus_signs
30
+ from .clarifier import ClarifierAgent, QueryAnalysis
31
+ from .evaluation import evaluate_predictions, load_json_records
32
+
33
+ try:
34
+ from .config import LiteratureConfig, get_config
35
+ except Exception: # pragma: no cover - optional runtime dependency
36
+ LiteratureConfig = None # type: ignore
37
+ get_config = None # type: ignore
38
+
39
+ try:
40
+ from .discovery import PaperDiscoveryAgent
41
+ except Exception: # pragma: no cover - optional runtime dependency
42
+ PaperDiscoveryAgent = None # type: ignore
43
+
44
+ try:
45
+ from .retrieval import PDFRetriever, extract_text_from_pdf
46
+ except Exception: # pragma: no cover - optional runtime dependency
47
+ PDFRetriever = None # type: ignore
48
+ extract_text_from_pdf = None # type: ignore
49
+
50
+ try:
51
+ from .extraction import ContextualizedExtractor, DataExtractor
52
+ except Exception: # pragma: no cover - optional runtime dependency
53
+ ContextualizedExtractor = None # type: ignore
54
+ DataExtractor = None # type: ignore
55
+
56
+ try:
57
+ from .converters import to_experiment_result
58
+ except Exception: # pragma: no cover - optional runtime dependency
59
+ to_experiment_result = None # type: ignore
60
+
61
+ __all__ = [
62
+ "LiteratureConfig",
63
+ "get_config",
64
+ "PaperMetadata",
65
+ "PaperSource",
66
+ "PolymerDataPoint",
67
+ "ExtractionResult",
68
+ "DataQuality",
69
+ "ContextualizedValue",
70
+ "ExperimentalConditions",
71
+ "LiteratureQuerySpec",
72
+ "PaperCardResult",
73
+ "LiteratureEvidenceRecord",
74
+ "LiteratureSupportSummary",
75
+ "QueryMode",
76
+ "ReviewStatus",
77
+ "PaperDiscoveryAgent",
78
+ "PDFRetriever",
79
+ "extract_text_from_pdf",
80
+ "DataExtractor",
81
+ "ContextualizedExtractor",
82
+ "QualityAssessor",
83
+ "QualityReport",
84
+ "UnitStandardizer",
85
+ "normalize_minus_signs",
86
+ "StandardizationResult",
87
+ "ClarifierAgent",
88
+ "QueryAnalysis",
89
+ "evaluate_predictions",
90
+ "load_json_records",
91
+ "to_experiment_result",
92
+ "PROPERTY_CATALOG",
93
+ "PLATFORM_PROPERTY_KEYS",
94
+ "TEMPLATES",
95
+ "TEMPLATE_LABELS",
96
+ "build_extraction_prompt",
97
+ "detect_property_keys",
98
+ "normalize_property_key",
99
+ "property_display_name",
100
+ ]
literature/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (2 kB). View file
 
literature/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (1.42 kB). View file
 
literature/__pycache__/clarifier.cpython-310.pyc ADDED
Binary file (2.53 kB). View file
 
literature/__pycache__/clarifier.cpython-313.pyc ADDED
Binary file (3.13 kB). View file
 
literature/__pycache__/config.cpython-310.pyc ADDED
Binary file (2.75 kB). View file
 
literature/__pycache__/config.cpython-313.pyc ADDED
Binary file (3.39 kB). View file
 
literature/__pycache__/converters.cpython-310.pyc ADDED
Binary file (1.76 kB). View file
 
literature/__pycache__/converters.cpython-313.pyc ADDED
Binary file (2.57 kB). View file
 
literature/__pycache__/discovery.cpython-310.pyc ADDED
Binary file (10.8 kB). View file
 
literature/__pycache__/discovery.cpython-313.pyc ADDED
Binary file (16.4 kB). View file
 
literature/__pycache__/evaluation.cpython-310.pyc ADDED
Binary file (5.28 kB). View file
 
literature/__pycache__/extraction.cpython-310.pyc ADDED
Binary file (22.6 kB). View file
 
literature/__pycache__/extraction.cpython-313.pyc ADDED
Binary file (30.2 kB). View file
 
literature/__pycache__/graph.cpython-313.pyc ADDED
Binary file (15.6 kB). View file
 
literature/__pycache__/property_registry.cpython-310.pyc ADDED
Binary file (8.36 kB). View file
 
literature/__pycache__/property_registry.cpython-313.pyc ADDED
Binary file (6.76 kB). View file
 
literature/__pycache__/quality.cpython-310.pyc ADDED
Binary file (5.71 kB). View file
 
literature/__pycache__/quality.cpython-313.pyc ADDED
Binary file (7.55 kB). View file
 
literature/__pycache__/retrieval.cpython-310.pyc ADDED
Binary file (9.92 kB). View file
 
literature/__pycache__/retrieval.cpython-313.pyc ADDED
Binary file (16 kB). View file
 
literature/__pycache__/schemas.cpython-310.pyc ADDED
Binary file (12.2 kB). View file
 
literature/__pycache__/schemas.cpython-313.pyc ADDED
Binary file (11.6 kB). View file
 
literature/__pycache__/standardizer.cpython-310.pyc ADDED
Binary file (6.41 kB). View file
 
literature/__pycache__/standardizer.cpython-313.pyc ADDED
Binary file (8.84 kB). View file
 
literature/clarifier.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Dict, List
5
+
6
+ from .property_registry import detect_property_keys, property_display_name
7
+
8
+
9
# Lower-cased substrings that indicate the query names a concrete polymer or
# material. Matched against the lower-cased query text in ClarifierAgent.analyze.
POLYMER_KEYWORDS = {
    "polymer",
    "polyimide",
    "peek",
    "polyethylene",
    "pedot",
    "pedot:pss",
    "p3ht",
    "smiles",
}

# Lower-cased substrings that indicate processing or measurement context
# (annealing, solvent choice, doping, deposition parameters, ...).
CONDITION_KEYWORDS = {
    "anneal",
    "annealing",
    "solvent",
    "dopant",
    "doping",
    "spin coat",
    "temperature",
    "thickness",
    "pressure",
    "humidity",
    "method",
}
33
+
34
+
35
@dataclass
class QueryAnalysis:
    """Structured outcome of a clarifier pass over a single user query."""

    original_query: str
    detected_polymers: List[str]
    detected_properties: List[str]
    detected_conditions: List[str]
    suggestions: List[str]
    clarification_required: bool
    status: str

    def to_payload(self) -> Dict[str, object]:
        """Return the analysis as a plain dict (field values are aliased, not copied)."""
        payload: Dict[str, object] = {}
        for attr in (
            "original_query",
            "detected_polymers",
            "detected_properties",
            "detected_conditions",
            "suggestions",
            "clarification_required",
            "status",
        ):
            payload[attr] = getattr(self, attr)
        return payload
55
+
56
+
57
class ClarifierAgent:
    """
    Lightweight clarifier for production search flows.
    It nudges users toward material + property + condition context without
    blocking valid free-form task queries.
    """

    def analyze(self, query: str) -> QueryAnalysis:
        """Scan *query* for polymer, property and condition hints and suggest gaps."""
        text = query or ""
        lowered = text.lower()

        # Keyword scans are substring matches against the lower-cased query.
        found_polymers: List[str] = []
        for keyword in POLYMER_KEYWORDS:
            if keyword in lowered:
                found_polymers.append(keyword)
        found_properties = detect_property_keys(text)
        found_conditions = [keyword for keyword in CONDITION_KEYWORDS if keyword in lowered]

        # One suggestion per missing dimension of context.
        hints: List[str] = []
        if not found_polymers:
            hints.append("Add a target polymer or material name.")
        if not found_properties:
            hints.append("Specify a key property focus, e.g. " + property_display_name("tg") + ".")
        if not found_conditions:
            hints.append("Add one processing or measurement condition if available.")

        # Clarification is only forced when neither a material nor a property
        # was recognized; conditions alone are optional context.
        needs_clarification = not (found_polymers or found_properties)

        return QueryAnalysis(
            original_query=query,
            detected_polymers=found_polymers,
            detected_properties=found_properties,
            detected_conditions=found_conditions,
            suggestions=hints,
            clarification_required=needs_clarification,
            status="pending_clarification" if needs_clarification else "ready",
        )
literature/config.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Configuration management for Literature Discovery module.
3
+ Uses pydantic-settings for environment variable loading.
4
+ """
5
+ from typing import Optional, List
6
+ from pydantic import Field
7
+ from pydantic_settings import BaseSettings
8
+ from functools import lru_cache
9
+
10
+
11
class LiteratureConfig(BaseSettings):
    """Literature mining configuration.

    Values are loaded by pydantic-settings from the environment (and a local
    ``.env`` file, see ``model_config``); each ``alias`` names the environment
    variable that feeds the field.
    """

    # API Keys
    pubmed_email: str = Field(default="scholar@university.edu", alias="PUBMED_EMAIL")
    pubmed_api_key: Optional[str] = Field(default=None, alias="PUBMED_API_KEY")
    semantic_scholar_api_key: Optional[str] = Field(default=None, alias="SEMANTIC_SCHOLAR_API_KEY")
    gemini_api_key: Optional[str] = Field(default=None, alias="GEMINI_API_KEY")
    # NOTE(review): intentionally sourced from MY_OPEN_WEBUI_API_KEY, not
    # OPENAI_API_KEY — confirm this mapping is still wanted.
    openai_api_key: Optional[str] = Field(default=None, alias="MY_OPEN_WEBUI_API_KEY")
    openai_base_url: Optional[str] = Field(default=None, alias="OPENAI_BASE_URL")
    pageindex_api_key: Optional[str] = Field(default=None, alias="PAGEINDEX_API_KEY")

    # LLM Configuration
    llm_model: str = Field(default="gemini/gemini-2.0-flash", alias="LLM_MODEL")
    embedding_model: str = Field(default="gemini/text-embedding-004")
    llm_temperature: float = Field(default=0.1, ge=0.0, le=1.0)
    llm_max_tokens: int = Field(default=4096)

    # Search Configuration
    default_search_limit: int = Field(default=20)
    pubmed_enabled: bool = Field(default=True)
    arxiv_enabled: bool = Field(default=True)
    semantic_scholar_enabled: bool = Field(default=True)  # Now enabled

    # Rate Limiting (Semantic Scholar: 1 req/sec)
    semantic_scholar_delay_s: float = Field(default=1.5)  # Slightly over 1s for safety
    pubmed_delay_s: float = Field(default=0.5)

    # Storage
    pdf_storage_dir: str = Field(default="data/literature/raw_pdfs")
    database_path: str = Field(default="data/literature/papers.db")

    # Processing
    max_concurrent_downloads: int = Field(default=3)
    extraction_timeout_s: int = Field(default=120)

    # PDF Download Headers (for avoiding 403)
    user_agent: str = Field(
        default="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    )

    # Target Polymers (for focused search)
    target_polymers: List[str] = Field(
        default=["PEDOT:PSS", "P3HT", "PBTTT", "P(NDI2OD-T2)", "PDPP-4T"]
    )

    # Extraction strategy: "paperqa" or "simple"
    extraction_strategy: str = Field(default="simple")
    # JSON-encoded list of model names offered in the UI; parsed downstream.
    literature_model_options: str = Field(default="[]", alias="LITERATURE_MODEL_OPTIONS")

    # pydantic-settings options: read ".env", ignore unknown variables.
    model_config = {
        "env_file": ".env",
        "env_file_encoding": "utf-8",
        "extra": "ignore",
    }
66
+
67
+
68
@lru_cache()
def get_config() -> LiteratureConfig:
    """Get configuration singleton."""
    # lru_cache on a zero-argument function yields a process-wide singleton:
    # the environment/.env is read only on the first call.
    return LiteratureConfig()
literature/converters.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Data model converters.
3
+
4
+ This module is now schema-optional:
5
+ - If legacy `src.utils.schema` exists, returns (Experiment, Result) objects.
6
+ - Otherwise returns two plain dict payloads for compatibility.
7
+ """
8
+ import time
9
+ from typing import Any, Dict, Tuple
10
+
11
+ from .schemas import PolymerDataPoint
12
+
13
+ try:
14
+ from src.utils.schema import Experiment, Result # type: ignore
15
+ HAS_LEGACY_SCHEMA = True
16
+ except Exception:
17
+ Experiment = None # type: ignore
18
+ Result = None # type: ignore
19
+ HAS_LEGACY_SCHEMA = False
20
+
21
+
22
def to_experiment_result(dp: PolymerDataPoint) -> Tuple[Any, Any]:
    """Convert a literature data point into an (experiment, result) pair.

    Returns legacy ``(Experiment, Result)`` objects when ``src.utils.schema``
    imported successfully (``HAS_LEGACY_SCHEMA``); otherwise two plain dicts
    with the same field layout.
    """
    # Millisecond timestamp keeps IDs unique across repeated conversions of
    # the same source paper.
    exp_id = f"lit_{dp.source_paper_id}_{int(time.time() * 1000)}"
    exp_payload: Dict[str, Any] = {
        "id": exp_id,
        "polymer_id": dp.polymer_name,
        # Missing numeric conditions are coerced to 0 / 0.0 rather than None.
        "concentration_mg_ml": dp.concentration_mg_ml or 0.0,
        "spin_speed_rpm": dp.spin_speed_rpm or 0,
        "annealing_temp_c": dp.annealing_temp_c or 0.0,
        "annealing_time_min": dp.annealing_time_min or 0.0,
        "status": "completed",
        # Provenance and secondary measurements travel in metadata.
        "metadata": {
            "dopant": dp.dopant,
            "dopant_ratio": dp.dopant_ratio,
            "solvent": dp.solvent,
            "source_paper_id": dp.source_paper_id,
            "source_table": dp.source_table_or_figure,
            "quality_tier": dp.quality_tier.value,
            "extraction_confidence": dp.extraction_confidence,
            "film_thickness_nm": dp.film_thickness_nm,
            "seebeck_coefficient_uv_k": dp.seebeck_coefficient_uv_k,
            "power_factor_uw_m_k2": dp.power_factor_uw_m_k2,
        },
    }
    res_payload: Dict[str, Any] = {
        "experiment_id": exp_id,
        "ec_s_cm": dp.electrical_conductivity_s_cm or 0.0,
        "tc_w_mk": dp.thermal_conductivity_w_mk,
        "xrd_crystallinity": dp.xrd_crystallinity_percent,
        "xrd_pi_stacking_angstrom": dp.xrd_pi_stacking_angstrom,
        "source": "literature",
    }

    if HAS_LEGACY_SCHEMA:
        return Experiment(**exp_payload), Result(**res_payload)  # type: ignore
    return exp_payload, res_payload
literature/discovery.py ADDED
@@ -0,0 +1,380 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Multi-source paper discovery module.
3
+ Implements PubMed, ArXiv, and Semantic Scholar search.
4
+ Uses synchronous code for MVP simplicity.
5
+ """
6
import logging
import re
import time
from typing import List, Optional

import arxiv
from Bio import Entrez

from .schemas import PaperMetadata, PaperSource
from .config import get_config
15
+
16
+ logger = logging.getLogger(__name__)
17
+ _SEMANTIC_SCHOLAR_IMPORT_MISSING_LOGGED = False
18
+
19
+
20
class ArxivSearcher:
    """ArXiv paper searcher."""

    def __init__(self) -> None:
        self.client = arxiv.Client()

    def search(self, query: str, limit: int = 10) -> List[PaperMetadata]:
        """
        Search ArXiv for papers.

        Args:
            query: Search query string
            limit: Maximum number of results

        Returns:
            List of PaperMetadata objects
        """
        logger.info(f"Searching ArXiv: '{query}' (limit={limit})")

        search = arxiv.Search(
            query=query,
            max_results=limit,
            sort_by=arxiv.SortCriterion.Relevance
        )

        papers: List[PaperMetadata] = []
        try:
            for result in self.client.results(search):
                # Strip only a *trailing* version suffix ("v1", "v2", ...).
                # The previous `.split('v')[0]` cut the ID at the first 'v'
                # anywhere in the string, which is only safe by accident for
                # modern numeric IDs; the anchored regex is robust for any ID.
                short_id = result.entry_id.split('/')[-1]
                arxiv_id = re.sub(r"v\d+$", "", short_id)

                paper = PaperMetadata(
                    id=f"arxiv_{arxiv_id}",
                    title=result.title,
                    authors=[a.name for a in result.authors],
                    year=result.published.year if result.published else None,
                    doi=result.doi,
                    abstract=result.summary,
                    venue="arXiv",
                    citation_count=None,
                    is_open_access=True,
                    source=PaperSource.ARXIV,
                    url=result.entry_id,
                    landing_url=result.entry_id,
                    pdf_url=result.pdf_url,
                )
                papers.append(paper)
        except Exception as e:
            # Best-effort: keep whatever was collected before the failure.
            logger.error(f"ArXiv search failed: {e}")

        logger.info(f"ArXiv returned {len(papers)} papers")
        return papers
72
+
73
+
74
class PubMedSearcher:
    """PubMed paper searcher using Biopython Entrez."""

    def __init__(self) -> None:
        # Entrez requires a contact e-mail; an API key (if configured)
        # is also registered globally on the Entrez module.
        config = get_config()
        Entrez.email = config.pubmed_email
        if config.pubmed_api_key:
            Entrez.api_key = config.pubmed_api_key
        # Seconds to wait between the esearch and efetch calls.
        self.delay = config.pubmed_delay_s

    def search(self, query: str, limit: int = 10) -> List[PaperMetadata]:
        """
        Search PubMed for papers.

        Args:
            query: Search query string
            limit: Maximum number of results

        Returns:
            List of PaperMetadata objects (empty list on any failure)
        """
        logger.info(f"Searching PubMed: '{query}' (limit={limit})")

        try:
            # Step 1: Search for IDs
            handle = Entrez.esearch(db="pubmed", term=query, retmax=limit)
            record = Entrez.read(handle)
            handle.close()

            id_list = record.get("IdList", [])
            if not id_list:
                logger.info("PubMed returned 0 papers")
                return []

            # Honor NCBI rate limits between the two E-utility calls.
            time.sleep(self.delay)

            # Step 2: Fetch details in XML format
            handle = Entrez.efetch(
                db="pubmed",
                id=id_list,
                rettype="xml",
                retmode="xml"
            )
            records = Entrez.read(handle)
            handle.close()

            papers: List[PaperMetadata] = []
            for article in records.get("PubmedArticle", []):
                try:
                    paper = self._parse_pubmed_article(article)
                    if paper:
                        papers.append(paper)
                except Exception as e:
                    # A single malformed article must not abort the batch.
                    logger.warning(f"Failed to parse PubMed article: {e}")

            logger.info(f"PubMed returned {len(papers)} papers")
            return papers

        except Exception as e:
            logger.error(f"PubMed search failed: {e}")
            return []

    def _parse_pubmed_article(self, article: dict) -> Optional[PaperMetadata]:
        """Parse a single PubMed article into PaperMetadata.

        Returns None when the article has no PMID; missing optional fields
        (year, DOI, venue) degrade to None / empty values.
        """
        medline = article.get("MedlineCitation", {})
        article_data = medline.get("Article", {})

        # Extract PMID
        pmid = str(medline.get("PMID", ""))
        if not pmid:
            return None

        # Extract title
        title = article_data.get("ArticleTitle", "Unknown Title")
        if isinstance(title, list):
            title = " ".join(str(t) for t in title)

        # Extract authors
        authors: List[str] = []
        author_list = article_data.get("AuthorList", [])
        for author in author_list:
            if isinstance(author, dict):
                last_name = author.get("LastName", "")
                fore_name = author.get("ForeName", "")
                # Collective/group authors lack LastName and are skipped.
                if last_name:
                    authors.append(f"{fore_name} {last_name}".strip())

        # Extract year
        year = None
        pub_date = article_data.get("Journal", {}).get("JournalIssue", {}).get("PubDate", {})
        if "Year" in pub_date:
            try:
                year = int(pub_date["Year"])
            except (ValueError, TypeError):
                pass

        # Extract abstract
        abstract = ""
        abstract_text = article_data.get("Abstract", {}).get("AbstractText", [])
        if isinstance(abstract_text, list):
            abstract = " ".join(str(t) for t in abstract_text)
        elif isinstance(abstract_text, str):
            abstract = abstract_text

        # Extract DOI
        doi = None
        id_list = article_data.get("ELocationID", [])
        for eid in id_list:
            # Entrez string elements carry their XML attributes; the DOI is
            # the ELocationID whose EIdType attribute is "doi".
            if hasattr(eid, "attributes") and eid.attributes.get("EIdType") == "doi":
                doi = str(eid)
                break

        journal = article_data.get("Journal", {})
        journal_title = journal.get("Title")

        return PaperMetadata(
            id=f"pubmed_{pmid}",
            title=str(title),
            authors=authors,
            year=year,
            doi=doi,
            abstract=abstract,
            venue=str(journal_title) if journal_title else None,
            citation_count=None,
            is_open_access=None,
            source=PaperSource.PUBMED,
            url=f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/",
            landing_url=f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/",
        )
203
+
204
+
205
class SemanticScholarSearcher:
    """Semantic Scholar paper searcher (with rate limiting)."""

    def __init__(self) -> None:
        config = get_config()
        self.api_key = config.semantic_scholar_api_key
        # Seconds slept before the request and between result items.
        self.delay = config.semantic_scholar_delay_s

    def search(self, query: str, limit: int = 10) -> List[PaperMetadata]:
        """
        Search Semantic Scholar for papers.
        Rate limited to avoid 403 errors.

        Args:
            query: Search query string
            limit: Maximum number of results

        Returns:
            List of PaperMetadata objects (empty list when the package is
            missing or the request fails)
        """
        logger.info(f"Searching Semantic Scholar: '{query}' (limit={limit})")

        # Lazy import to avoid dependency issues
        try:
            from semanticscholar import SemanticScholar
        except ImportError:
            # Log the missing optional dependency only once per process.
            global _SEMANTIC_SCHOLAR_IMPORT_MISSING_LOGGED
            if not _SEMANTIC_SCHOLAR_IMPORT_MISSING_LOGGED:
                logger.debug("semanticscholar package not installed; Semantic Scholar source disabled.")
                _SEMANTIC_SCHOLAR_IMPORT_MISSING_LOGGED = True
            return []

        time.sleep(self.delay)  # Initial delay

        try:
            client = SemanticScholar(api_key=self.api_key)
            results = client.search_paper(
                query,
                limit=limit,
                fields=['title', 'abstract', 'authors', 'year', 'externalIds', 'url', 'isOpenAccess', 'openAccessPdf', 'venue', 'citationCount']
            )

            papers: List[PaperMetadata] = []
            # The client may page beyond `limit`; cap the count ourselves.
            for item in results:
                if len(papers) >= limit:
                    break

                # Get PDF URL if available
                pdf_url = None
                if item.openAccessPdf and isinstance(item.openAccessPdf, dict):
                    pdf_url = item.openAccessPdf.get('url')

                paper = PaperMetadata(
                    id=f"s2_{item.paperId}",
                    title=item.title or "Unknown",
                    authors=[a.name for a in (item.authors or [])],
                    year=item.year,
                    doi=item.externalIds.get("DOI") if item.externalIds else None,
                    abstract=item.abstract,
                    venue=getattr(item, "venue", None),
                    citation_count=getattr(item, "citationCount", None),
                    is_open_access=bool(getattr(item, "isOpenAccess", False)),
                    source=PaperSource.SEMANTIC_SCHOLAR,
                    url=item.url,
                    landing_url=item.url,
                    pdf_url=pdf_url,
                )
                papers.append(paper)
                time.sleep(self.delay)  # Rate limit between items

            logger.info(f"Semantic Scholar returned {len(papers)} papers")
            return papers

        except Exception as e:
            logger.warning(f"Semantic Scholar search failed (likely 403): {e}")
            return []
281
+
282
+
283
class PaperDiscoveryAgent:
    """
    Paper discovery agent.
    Aggregates multiple search sources, deduplicates, and sorts results.
    """

    def __init__(self) -> None:
        config = get_config()
        self.searchers: List[tuple] = []

        # Instantiate only the sources enabled in configuration.
        source_factories = (
            ("arxiv", ArxivSearcher, config.arxiv_enabled),
            ("pubmed", PubMedSearcher, config.pubmed_enabled),
            ("semantic_scholar", SemanticScholarSearcher, config.semantic_scholar_enabled),
        )
        for name, factory, enabled in source_factories:
            if enabled:
                self.searchers.append((name, factory()))

        logger.info(f"Initialized PaperDiscoveryAgent with sources: {[s[0] for s in self.searchers]}")

    def discover(
        self,
        query: str,
        limit_per_source: int = 10,
        deduplicate: bool = True
    ) -> List[PaperMetadata]:
        """
        Search all sources and aggregate results.

        Args:
            query: Search query
            limit_per_source: Maximum results per source
            deduplicate: Whether to deduplicate by title

        Returns:
            Aggregated list of papers
        """
        collected: List[PaperMetadata] = []

        # Query each configured source; a failing source is logged and skipped.
        for source_name, searcher in self.searchers:
            try:
                found = searcher.search(query, limit_per_source)
            except Exception as e:
                logger.error(f"Search failed for {source_name}: {e}")
                continue
            collected.extend(found)
            logger.info(f"{source_name} returned {len(found)} papers")

        logger.info(f"Total papers before deduplication: {len(collected)}")

        if deduplicate:
            collected = self._deduplicate(collected)
            logger.info(f"Total papers after deduplication: {len(collected)}")

        return collected

    def _deduplicate(self, papers: List[PaperMetadata]) -> List[PaperMetadata]:
        """Deduplicate papers by normalized title, keeping first occurrences."""
        seen: set = set()
        kept: List[PaperMetadata] = []

        for paper in papers:
            # Normalization: lower-case and strip surrounding whitespace.
            key = paper.title.lower().strip()
            if key in seen:
                continue
            seen.add(key)
            kept.append(paper)

        return kept

    def build_thermoelectric_query(
        self,
        polymer: Optional[str] = None,
        include_tc: bool = True
    ) -> str:
        """
        Build a specialized thermoelectric search query.

        Args:
            polymer: Specific polymer name (e.g., "P3HT")
            include_tc: Whether to include thermal conductivity keywords

        Returns:
            Optimized search query string
        """
        terms: List[str] = []
        if polymer:
            terms.append(polymer)
        terms.extend([
            "organic thermoelectric",
            "conjugated polymer",
            "electrical conductivity",
        ])
        if include_tc:
            terms.append("thermal conductivity")

        query = " ".join(terms)
        logger.debug(f"Built query: {query}")
        return query
literature/evaluation.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Offline evaluation helpers for structured literature extraction.
3
+
4
+ The harness is intentionally dataset-agnostic so POLYIE-formatted exports and
5
+ internal regression sets can share the same metric implementation.
6
+ """
7
+ from __future__ import annotations
8
+
9
+ import json
10
+ from pathlib import Path
11
+ from typing import Any, Dict, Iterable, List, Sequence, Tuple
12
+
13
+ from .property_registry import normalize_property_key
14
+
15
+
16
+ CORE_FIELDS = ["material_name", "property_key", "raw_value", "raw_unit", "method"]
17
+
18
+
19
def load_json_records(path: str | Path) -> List[Dict[str, Any]]:
    """Load evaluation records from a ``.jsonl`` file or a JSON-array file.

    Raises:
        ValueError: if a non-``.jsonl`` file does not contain a top-level list.
    """
    fp = Path(path)
    text = fp.read_text(encoding="utf-8")

    if fp.suffix == ".jsonl":
        # One JSON object per non-blank line.
        records: List[Dict[str, Any]] = []
        for line in text.splitlines():
            if line.strip():
                records.append(json.loads(line))
        return records

    payload = json.loads(text)
    if isinstance(payload, list):
        return payload
    raise ValueError(f"Unsupported evaluation file format: {fp}")
27
+
28
+
29
def _coalesce(record: Dict[str, Any], *keys: str) -> str:
    """Return the first non-missing value among *keys* as a stripped string.

    A value is missing only when it is ``None`` or ``""``. Unlike the chained
    ``or`` expressions used previously, this keeps falsy-but-real values: a
    measured ``0`` / ``0.0`` no longer collapses to an empty string.
    """
    for key in keys:
        value = record.get(key)
        if value is not None and value != "":
            return str(value).strip()
    return ""


def normalize_record(record: Dict[str, Any]) -> Dict[str, Any]:
    """Normalize one extraction record onto the canonical evaluation schema.

    Accepts several field aliases (e.g. ``polymer_name`` / ``material`` for
    ``material_name``) and returns a dict with exactly the ``CORE_FIELDS``
    keys plus ``evidence_quote``, all stripped strings.
    """
    raw_property = _coalesce(record, "property_key", "property_name")
    return {
        "material_name": _coalesce(record, "material_name", "polymer_name", "material"),
        # Prefer the canonical registry key; fall back to the raw label.
        "property_key": normalize_property_key(raw_property) or raw_property,
        "raw_value": _coalesce(record, "raw_value", "value"),
        "raw_unit": _coalesce(record, "raw_unit", "unit"),
        "method": _coalesce(record, "method", "measurement_method"),
        "evidence_quote": _coalesce(record, "evidence_quote", "source_quote"),
    }
51
+
52
+
53
+ def _safe_div(numerator: float, denominator: float) -> float:
54
+ return numerator / denominator if denominator else 0.0
55
+
56
+
57
+ def _f1(precision: float, recall: float) -> float:
58
+ return (2 * precision * recall) / (precision + recall) if (precision + recall) else 0.0
59
+
60
+
61
def _field_pairs(records: Sequence[Dict[str, Any]], field: str) -> set[Tuple[str, str]]:
    """Collect lower-cased (material_name, *field* value) pairs.

    Records whose material name or field value is empty after normalization
    are skipped.
    """
    normalized = (normalize_record(raw) for raw in records)
    return {
        (rec.get("material_name", "").lower(), rec.get(field, "").lower())
        for rec in normalized
        if rec.get("material_name", "") and rec.get(field, "")
    }
70
+
71
+
72
def _relation_tuples(records: Sequence[Dict[str, Any]]) -> set[Tuple[str, str, str]]:
    """Lower-cased (material, property, value) triples; incomplete records skipped."""
    result: set[Tuple[str, str, str]] = set()
    for raw in records:
        rec = normalize_record(raw)
        material = rec["material_name"]
        prop = rec["property_key"]
        value = rec["raw_value"]
        if material and prop and value:
            result.add((material.lower(), prop.lower(), value.lower()))
    return result
85
+
86
+
87
def _record_tuples(records: Sequence[Dict[str, Any]]) -> set[Tuple[str, str, str, str, str]]:
    """One lower-cased tuple of all CORE_FIELDS per record (blank values included)."""
    return {
        tuple(rec[field].lower() for field in CORE_FIELDS)
        for rec in (normalize_record(raw) for raw in records)
    }
95
+
96
+
97
def evaluate_predictions(
    gold_records: Sequence[Dict[str, Any]],
    predicted_records: Sequence[Dict[str, Any]],
) -> Dict[str, Any]:
    """Score predicted extraction records against gold records.

    Returns a dict with:
      - per-field precision/recall/F1 over (material, field value) pairs,
      - relation-level metrics over (material, property, value) triples,
      - record-level metrics over full CORE_FIELDS tuples,
      - record completeness and evidence-grounding rates,
      - raw gold/predicted counts.
    """
    gold = [normalize_record(record) for record in gold_records]
    predicted = [normalize_record(record) for record in predicted_records]

    # Per-field precision/recall/F1 keyed on (material, value) pairs.
    field_metrics: Dict[str, Dict[str, float]] = {}
    for field in CORE_FIELDS:
        gold_pairs = _field_pairs(gold, field)
        predicted_pairs = _field_pairs(predicted, field)
        tp = len(gold_pairs & predicted_pairs)
        precision = _safe_div(tp, len(predicted_pairs))
        recall = _safe_div(tp, len(gold_pairs))
        field_metrics[field] = {
            "precision": precision,
            "recall": recall,
            "f1": _f1(precision, recall),
        }

    # Relation level: exact match on (material, property, value) triples.
    gold_rel = _relation_tuples(gold)
    pred_rel = _relation_tuples(predicted)
    rel_tp = len(gold_rel & pred_rel)
    rel_precision = _safe_div(rel_tp, len(pred_rel))
    rel_recall = _safe_div(rel_tp, len(gold_rel))

    # Record level: exact match on all CORE_FIELDS at once.
    gold_records_set = _record_tuples(gold)
    pred_records_set = _record_tuples(predicted)
    record_tp = len(gold_records_set & pred_records_set)
    record_precision = _safe_div(record_tp, len(pred_records_set))
    record_recall = _safe_div(record_tp, len(gold_records_set))

    # Fraction of CORE_FIELDS that are non-empty, averaged over predictions.
    filled_fields = [
        sum(1 for field in CORE_FIELDS if record.get(field))
        for record in predicted
    ]
    record_completeness = _safe_div(sum(filled_fields), len(predicted) * len(CORE_FIELDS))
    # Share of predictions carrying a supporting evidence quote.
    source_grounding_hit_rate = _safe_div(
        sum(1 for record in predicted if record.get("evidence_quote")),
        len(predicted),
    )

    return {
        "field_metrics": field_metrics,
        "relation_level": {
            "precision": rel_precision,
            "recall": rel_recall,
            "f1": _f1(rel_precision, rel_recall),
        },
        "record_level": {
            "precision": record_precision,
            "recall": record_recall,
            "f1": _f1(record_precision, record_recall),
        },
        "record_completeness": record_completeness,
        "source_grounding_hit_rate": source_grounding_hit_rate,
        "gold_count": len(gold),
        "predicted_count": len(predicted),
    }
literature/extraction.py ADDED
@@ -0,0 +1,863 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ LLM-based structured data extraction module.
3
+ Implements flexible interface: PageIndex (RAG via indexed PDFs) or Simple extraction (fallback).
4
+
5
+ Prompts are dynamically built from user-selected target properties via
6
+ ``literature.property_registry.build_extraction_prompt``.
7
+ """
8
+ import json
9
+ import re
10
+ import logging
11
+ import os
12
+ from typing import List, Optional, Any
13
+ from datetime import datetime
14
+
15
+ from .schemas import (
16
+ PaperMetadata,
17
+ PolymerDataPoint,
18
+ ExtractionResult,
19
+ DataQuality
20
+ )
21
+ from .config import get_config
22
+ from .retrieval import extract_text_from_pdf
23
+ from .property_registry import PROPERTY_CATALOG, build_extraction_prompt, TEMPLATES
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
# Default property set used when no explicit target properties are provided.
# The legacy thermoelectric-only template no longer exists in the production
# registry, so fall back to the platform-wide property core.
# (If the "platform_core" template is missing or empty, every key in
# PROPERTY_CATALOG is used instead.)
_DEFAULT_PROPERTIES = TEMPLATES.get("platform_core") or list(PROPERTY_CATALOG.keys())
31
+
32
# Sentinel error messages that mark an *expected* skip (missing or
# unconfigured backends) rather than a genuine extraction failure.
# Produced by DataExtractor/ContextualizedExtractor and recognized by
# is_expected_skip_error().
_SKIP_ERROR_MESSAGES = {
    "llm_unconfigured",
    "contextual_llm_unconfigured",
    "extraction_backend_unconfigured",
    "pageindex_requires_pdf_no_simple_backend",
    "pageindex_sdk_unavailable_no_simple_backend",
}
39
+
40
+
41
+ def _normalize_base_url(url: Optional[str]) -> Optional[str]:
42
+ text = str(url or "").strip().rstrip("/")
43
+ return text or None
44
+
45
+
46
def _is_http_url(url: Optional[str]) -> bool:
    """Return True when *url* normalizes to a non-empty http:// or https:// URL."""
    normalized = _normalize_base_url(url)
    if normalized is None:
        return False
    return normalized.startswith(("http://", "https://"))
49
+
50
+
51
def is_expected_skip_error(error_message: Optional[str]) -> bool:
    """Return True when *error_message* is one of the known skip sentinels."""
    normalized = str(error_message or "").strip()
    return normalized in _SKIP_ERROR_MESSAGES
53
+
54
+
55
# ============== JSON Safe Parsing (Fix Logic Bug #4 & #5) ==============

# Translation table mapping the Unicode dash/minus variants that OCR and PDF
# text extraction commonly emit onto the ASCII minus sign.
_MINUS_TRANSLATION = str.maketrans({
    '\u2212': '-',  # MINUS SIGN
    '\u2013': '-',  # EN DASH
    '\u2014': '-',  # EM DASH
    '\u2010': '-',  # HYPHEN
    '\u2011': '-',  # NON-BREAKING HYPHEN
    '\u2012': '-',  # FIGURE DASH
    '\u207b': '-',  # SUPERSCRIPT MINUS
    '\u208b': '-',  # SUBSCRIPT MINUS
})


def normalize_minus_signs(s: str) -> str:
    """
    Normalize all types of minus signs to ASCII minus.

    Fixes Logic Bug #5: OCR may produce Unicode minus (U+2212) instead of ASCII.

    Args:
        s: Arbitrary text, possibly containing Unicode dash variants.

    Returns:
        The text with every known dash/minus variant replaced by '-'.
    """
    # str.translate performs the whole substitution in one C-level pass
    # instead of one full string scan per chained str.replace call.
    return s.translate(_MINUS_TRANSLATION)


def safe_json_loads(text: str) -> Any:
    """
    Safely parse JSON, handling common LLM output issues.

    Fixes Logic Bug #4: LLM may return NaN, Infinity, Python-style None,
    trailing commas, or markdown code fences around the JSON payload.

    Args:
        text: Raw LLM output.

    Returns:
        The parsed JSON value, or None when *text* is empty.

    Raises:
        json.JSONDecodeError: When parsing fails and repair also fails.
        ImportError: When repair is needed but json_repair is not installed.
    """
    if not text:
        return None

    text = text.strip()

    # Extract JSON from markdown code blocks
    if "```json" in text:
        text = text.split("```json")[1].split("```")[0]
    elif "```" in text:
        parts = text.split("```")
        if len(parts) >= 2:
            text = parts[1]

    # Normalize minus signs before any numeric-token rewriting below.
    text = normalize_minus_signs(text)

    # Fix Python-style -> JSON-style
    text = re.sub(r'\bNone\b', 'null', text)
    text = re.sub(r'\bTrue\b', 'true', text)
    text = re.sub(r'\bFalse\b', 'false', text)

    # Remove trailing commas
    text = re.sub(r',\s*}', '}', text)
    text = re.sub(r',\s*]', ']', text)

    # Replace NaN / Infinity with null. The optional leading '-' must be part
    # of the pattern: substituting bare "Infinity" first would leave the
    # invalid token "-null" behind for "-Infinity" (and likewise for "-NaN").
    text = re.sub(r'-?\bNaN\b', 'null', text)
    text = re.sub(r'-?\bInfinity\b', 'null', text)

    try:
        return json.loads(text)
    except json.JSONDecodeError as e:
        logger.warning(f"Initial JSON parse failed: {e}")

        # Try json_repair if available
        try:
            from json_repair import repair_json
            repaired = repair_json(text)
            return json.loads(repaired)
        except ImportError:
            logger.warning("json_repair not installed, cannot repair JSON")
            raise
        except Exception as e2:
            logger.error(f"JSON repair also failed: {e2}")
            raise
130
+
131
+
132
+
133
+
134
# Extraction prompt template (static legacy version).
# NOTE(review): nothing in this module references EXTRACTION_PROMPT — the
# extraction paths build their prompts via build_extraction_prompt(); confirm
# there are no external importers before removing.
EXTRACTION_PROMPT = """
You are an expert in organic thermoelectrics and polymer science.
Your task is to extract ALL experimental data points from the provided paper.

## Target Data
Extract data for conjugated polymers used in thermoelectric applications, including:
- PEDOT:PSS, P3HT, PBTTT, P(NDI2OD-T2), PDPP series, etc.

## Required Fields (extract as many as available)
For EACH data point, extract:

### Material Information
- polymer_name: The polymer name/abbreviation (e.g., "P3HT", "PEDOT:PSS")
- dopant: Dopant used (e.g., "DMSO", "H2SO4", "FeCl3")
- dopant_ratio: Dopant concentration if specified (e.g., "5 wt%", "1 M")

### Processing Conditions
- solvent: Solvent used for film preparation
- concentration_mg_ml: Solution concentration in mg/mL
- spin_speed_rpm: Spin coating speed in RPM
- spin_time_s: Spin coating time in seconds
- annealing_temp_c: Annealing temperature in Celsius
- annealing_time_min: Annealing time in minutes
- annealing_atmosphere: Atmosphere during annealing (N2, Air, Vacuum)
- film_thickness_nm: Film thickness in nanometers

### Electrical Properties
- electrical_conductivity_s_cm: Electrical conductivity in S/cm
- seebeck_coefficient_uv_k: Seebeck coefficient in μV/K
- power_factor_uw_m_k2: Power factor in μW/(m·K²)

### Thermal Properties (IMPORTANT - often sparse)
- thermal_conductivity_w_mk: Thermal conductivity in W/(m·K)
- zt_figure_of_merit: ZT figure of merit (dimensionless)

### Structural Characterization
- xrd_crystallinity_percent: Crystallinity percentage from XRD
- xrd_pi_stacking_angstrom: π-π stacking distance in Angstrom
- xrd_lamellar_spacing_angstrom: Lamellar spacing in Angstrom

### Metadata
- source_table_or_figure: Where the data was found (e.g., "Table 1", "Figure 3")
- extraction_confidence: Your confidence in this extraction (0.0 to 1.0)

## CRITICAL Rules
1. Extract ONLY experimentally measured values, not theoretical predictions
2. Convert all units to the specified standard units
3. If a value range is given (e.g., "100-200 S/cm"), use the AVERAGE
4. If a value is "not measured" or "N/A", use null
5. Each row in a table = one data point
6. Include the source_table_or_figure for traceability

## Output Format
Return a valid JSON array. Example:
[
  {
    "polymer_name": "PEDOT:PSS",
    "dopant": "H2SO4",
    "dopant_ratio": "5 vol%",
    "electrical_conductivity_s_cm": 1200.5,
    "thermal_conductivity_w_mk": 0.35,
    "source_table_or_figure": "Table 2",
    "extraction_confidence": 0.9
  }
]

Return ONLY the JSON array, no markdown formatting, no explanations.
If no relevant data is found, return an empty array: []
"""
204
+
205
+
206
class DataExtractor:
    """
    Flexible data extractor with fallback strategy.

    Primary: PageIndex (RAG via indexed PDFs)
    Fallback: Simple extraction (pymupdf + direct LLM)
    """

    def __init__(
        self,
        strategy: Optional[str] = None,
        target_properties: Optional[List[str]] = None,
        extra_instructions: str = "",
    ) -> None:
        """Read backend credentials and settings from the global config.

        Args:
            strategy: Extraction strategy override; defaults to config.extraction_strategy.
            target_properties: Property keys to extract; defaults to _DEFAULT_PROPERTIES.
            extra_instructions: Free-form text forwarded to build_extraction_prompt().
        """
        config = get_config()
        self.strategy = strategy or config.extraction_strategy
        self.llm_model = config.llm_model
        self.gemini_key = config.gemini_api_key
        self.openai_key = config.openai_api_key
        # Normalized (no trailing slash) so endpoint paths concatenate cleanly.
        self.openai_base_url = _normalize_base_url(config.openai_base_url)
        self.pdf_dir = config.pdf_storage_dir
        self.pageindex_api_key = config.pageindex_api_key
        self.target_properties = target_properties or _DEFAULT_PROPERTIES
        self.extra_instructions = extra_instructions

        logger.info(f"Initialized DataExtractor with strategy: {self.strategy}, properties: {self.target_properties}")

    def has_openai_backend(self) -> bool:
        # Usable only when the base URL is a well-formed http(s) URL.
        return _is_http_url(self.openai_base_url)

    def has_any_llm_backend(self) -> bool:
        # Either an OpenAI-compatible endpoint or a Gemini API key suffices.
        return self.has_openai_backend() or bool(str(self.gemini_key or "").strip())

    def has_pageindex_backend(self) -> bool:
        return bool(str(self.pageindex_api_key or "").strip())

    def can_attempt_extraction(self) -> bool:
        return self.has_pageindex_backend() or self.has_any_llm_backend()

    def availability_reason(self) -> Optional[str]:
        # None means extraction can proceed; otherwise a human-readable skip reason.
        if self.can_attempt_extraction():
            return None
        return "Structured extraction skipped: configure PAGEINDEX_API_KEY or a valid LLM backend."

    def extract_from_papers(
        self,
        papers: List[PaperMetadata],
        use_full_text: bool = True
    ) -> List[ExtractionResult]:
        """
        Extract data from multiple papers.

        Args:
            papers: List of paper metadata (with pdf_path if available)
            use_full_text: Use PDF full text if available

        Returns:
            List of extraction results
        """
        results: List[ExtractionResult] = []

        for paper in papers:
            try:
                if self.strategy == "pageindex":
                    result = self._extract_with_pageindex(paper)
                else:
                    result = self._extract_simple(paper, use_full_text)
                results.append(result)
            except Exception as e:
                # A failure on one paper must not abort the whole batch;
                # record it as an unsuccessful ExtractionResult instead.
                logger.error(f"Extraction failed for {paper.id}: {e}")
                results.append(ExtractionResult(
                    paper=paper,
                    success=False,
                    error_message=str(e)
                ))

        return results

    def _extract_simple(
        self,
        paper: PaperMetadata,
        use_full_text: bool = True
    ) -> ExtractionResult:
        """
        Simple extraction: Extract PDF text -> Feed to LLM -> Parse JSON.
        Often more effective for metadata extraction.
        """
        logger.info(f"Simple extraction for: {paper.title[:50]}...")

        # Get content
        content = self._prepare_content(paper, use_full_text)
        if not content:
            return ExtractionResult(
                paper=paper,
                success=False,
                error_message="No content available"
            )

        # Use a sentinel error message so callers can recognize an
        # expected skip (see _SKIP_ERROR_MESSAGES / is_expected_skip_error).
        if not self.has_any_llm_backend():
            return ExtractionResult(
                paper=paper,
                success=False,
                error_message="llm_unconfigured",
                extraction_notes="Simple extraction skipped because no LLM backend is configured.",
            )

        # Call LLM with dynamic prompt
        dynamic_prompt = build_extraction_prompt(self.target_properties, self.extra_instructions)
        # The prompt template carries literal {title}/{content} placeholders;
        # str.replace (not str.format) avoids issues with braces in the paper text.
        prompt = dynamic_prompt.replace("{title}", paper.title or "Unknown").replace("{content}", content)

        try:
            raw_response = self._call_llm(prompt)

            if not raw_response:
                return ExtractionResult(
                    paper=paper,
                    success=False,
                    error_message="LLM returned empty response"
                )

            # Parse response
            data_points = self._parse_llm_output(raw_response, paper.id)

            # Assess quality for each point
            for dp in data_points:
                dp.quality_tier = self._assess_quality(dp)

            return ExtractionResult(
                paper=paper,
                data_points=data_points,
                llm_model_used=self.llm_model,
                extraction_timestamp=datetime.now(),
                success=True
            )

        except Exception as e:
            logger.error(f"Simple extraction failed: {e}")
            return ExtractionResult(
                paper=paper,
                success=False,
                error_message=str(e)
            )

    def _extract_with_pageindex(self, paper: PaperMetadata) -> ExtractionResult:
        """
        PageIndex extraction (RAG-enhanced via indexed PDF).
        Submits PDF to PageIndex, then uses chat_completions with extraction prompt.
        Falls back to simple extraction if PageIndex is unavailable or fails.
        """
        # Guard chain: each missing prerequisite falls back to simple
        # extraction when an LLM backend exists, otherwise returns a
        # sentinel skip error (see _SKIP_ERROR_MESSAGES).
        if not self.has_pageindex_backend():
            if self.has_any_llm_backend():
                return self._extract_simple(paper)
            return ExtractionResult(
                paper=paper,
                success=False,
                error_message="extraction_backend_unconfigured",
                extraction_notes="No PageIndex or LLM backend is configured.",
            )

        if not paper.pdf_path or not os.path.exists(paper.pdf_path):
            if self.has_any_llm_backend():
                return self._extract_simple(paper)
            return ExtractionResult(
                paper=paper,
                success=False,
                error_message="pageindex_requires_pdf_no_simple_backend",
                extraction_notes="PageIndex extraction requires a PDF when no simple LLM fallback is available.",
            )

        try:
            from src.literature_service.pageindex_client import PageIndexService
        except ImportError:
            if self.has_any_llm_backend():
                return self._extract_simple(paper)
            return ExtractionResult(
                paper=paper,
                success=False,
                error_message="pageindex_sdk_unavailable_no_simple_backend",
                extraction_notes="PageIndex SDK unavailable and no simple LLM fallback is configured.",
            )

        logger.info(f"PageIndex extraction for: {paper.title[:50]}...")

        try:
            service = PageIndexService(api_key=self.pageindex_api_key)

            # Submit the document to PageIndex
            doc_id = service.submit_document(paper.pdf_path)
            logger.info(f"Submitted to PageIndex, doc_id={doc_id}")

            # Wait for indexing to complete (poll status)
            import time
            for _ in range(30):  # max ~60 seconds
                status = service.get_document_status(doc_id)
                if status == "completed":
                    break
                if status in ("error", "failed"):
                    raise RuntimeError(f"PageIndex indexing failed with status: {status}")
                time.sleep(2)
            else:
                # for/else: the loop exhausted without ever seeing "completed".
                logger.warning("PageIndex indexing timed out, falling back to simple")
                return self._extract_simple(paper)

            # Use chat_completions with dynamic extraction prompt
            dynamic_prompt = build_extraction_prompt(self.target_properties, self.extra_instructions)
            # For PageIndex chat, we don't need the {title}/{content} placeholders
            # since the document is already indexed; strip those sections.
            pi_prompt = dynamic_prompt.split("**PAPER CONTENT:**")[0].strip()
            raw_answer = service.chat_completions(pi_prompt, doc_id)

            if not raw_answer:
                return ExtractionResult(
                    paper=paper,
                    success=False,
                    error_message="PageIndex returned empty response"
                )

            # Parse result
            data_points = self._parse_llm_output(raw_answer, paper.id)

            for dp in data_points:
                dp.quality_tier = self._assess_quality(dp)

            return ExtractionResult(
                paper=paper,
                data_points=data_points,
                llm_model_used="pageindex",
                extraction_timestamp=datetime.now(),
                success=True
            )

        except Exception as e:
            # Any PageIndex failure degrades gracefully to the simple path.
            logger.warning(f"PageIndex extraction failed, falling back to simple: {e}")
            return self._extract_simple(paper)

    def _prepare_content(
        self,
        paper: PaperMetadata,
        use_full_text: bool = True
    ) -> Optional[str]:
        """Prepare text content for extraction.

        Preference order: PDF full text (first 5 pages) > abstract > title only.
        Returns None when no usable text exists at all.
        """
        # Try PDF full text first
        if use_full_text and paper.pdf_path and os.path.exists(paper.pdf_path):
            full_text = extract_text_from_pdf(paper.pdf_path, max_pages=5)
            if full_text:
                return f"Title: {paper.title}\n\n{full_text}"

        # Fallback to abstract
        if paper.abstract:
            return f"Title: {paper.title}\n\nAbstract:\n{paper.abstract}"

        # Just title
        if paper.title:
            return f"Title: {paper.title}"

        return None

    def _call_llm(self, prompt: str) -> Optional[str]:
        """
        Call LLM (OpenAI-compatible first, then Gemini fallback).
        Prioritizes CRC OpenWebUI for reliability.

        Returns:
            The raw text response, or None when every backend is
            unavailable or fails.
        """
        # Try OpenAI-compatible (CRC) first
        if self.openai_key and self.openai_base_url:
            try:
                logger.info(f"Calling CRC OpenWebUI...")
                return self._call_openai_compatible(prompt)
            except Exception as e:
                logger.warning(f"CRC OpenWebUI call failed: {e}")

        # Fallback to Gemini
        if self.gemini_key:
            try:
                logger.info("Falling back to Gemini...")
                return self._call_gemini(prompt)
            except Exception as e:
                logger.warning(f"Gemini call failed: {e}")

        logger.debug("No LLM backend configured; skipping simple extraction call.")
        return None

    def _call_gemini(self, prompt: str) -> str:
        """Call Gemini API."""
        # Imported lazily so the module loads even without the Gemini SDK.
        import google.generativeai as genai

        genai.configure(api_key=self.gemini_key)
        model = genai.GenerativeModel("gemini-2.0-flash")

        response = model.generate_content(prompt)
        return response.text

    def _call_openai_compatible(self, prompt: str) -> str:
        """Call OpenAI-compatible API (CRC OpenWebUI)."""
        from openai import OpenAI

        client = OpenAI(
            api_key=self.openai_key,
            base_url=self.openai_base_url
        )

        # Use model from config (set in .env LLM_MODEL)
        model = self.llm_model
        # Handle litellm-style prefixes
        if model.startswith("gemini/"):
            model = "gpt-oss:latest"  # Fallback for CRC
        logger.info(f"Using model: {model}")

        # Low temperature keeps the structured-extraction output deterministic.
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.1
        )

        return response.choices[0].message.content

    def _parse_llm_output(
        self,
        raw_output: str,
        paper_id: str
    ) -> List[PolymerDataPoint]:
        """Parse LLM output into structured data points.

        Malformed JSON yields an empty list; individual malformed records
        are skipped with a warning rather than failing the whole batch.
        """
        try:
            # Use safe_json_loads for robust parsing
            raw_data = safe_json_loads(raw_output)
        except Exception as e:
            logger.error(f"JSON parsing failed for {paper_id}: {e}")
            return []

        if raw_data is None:
            logger.warning(f"No JSON data found in output for {paper_id}")
            return []

        # Ensure it's a list
        if not isinstance(raw_data, list):
            raw_data = [raw_data]

        # Convert to Pydantic models
        data_points: List[PolymerDataPoint] = []
        for item in raw_data:
            try:
                dp = PolymerDataPoint(
                    polymer_name=item.get("polymer_name", "Unknown"),
                    dopant=item.get("dopant"),
                    dopant_ratio=item.get("dopant_ratio"),
                    solvent=item.get("solvent"),
                    concentration_mg_ml=item.get("concentration_mg_ml"),
                    spin_speed_rpm=item.get("spin_speed_rpm"),
                    spin_time_s=item.get("spin_time_s"),
                    annealing_temp_c=item.get("annealing_temp_c"),
                    annealing_time_min=item.get("annealing_time_min"),
                    annealing_atmosphere=item.get("annealing_atmosphere"),
                    film_thickness_nm=item.get("film_thickness_nm"),
                    electrical_conductivity_s_cm=item.get("electrical_conductivity_s_cm"),
                    seebeck_coefficient_uv_k=item.get("seebeck_coefficient_uv_k"),
                    power_factor_uw_m_k2=item.get("power_factor_uw_m_k2"),
                    thermal_conductivity_w_mk=item.get("thermal_conductivity_w_mk"),
                    zt_figure_of_merit=item.get("zt_figure_of_merit"),
                    xrd_crystallinity_percent=item.get("xrd_crystallinity_percent"),
                    xrd_pi_stacking_angstrom=item.get("xrd_pi_stacking_angstrom"),
                    xrd_lamellar_spacing_angstrom=item.get("xrd_lamellar_spacing_angstrom"),
                    source_paper_id=paper_id,
                    source_table_or_figure=item.get("source_table_or_figure"),
                    extraction_confidence=item.get("extraction_confidence", 0.5),
                )
                data_points.append(dp)
            except Exception as e:
                logger.warning(f"Failed to parse data point: {e}")

        logger.info(f"Extracted {len(data_points)} data points from {paper_id}")
        return data_points

    def _assess_quality(self, dp: PolymerDataPoint) -> DataQuality:
        """Assess data point quality tier.

        GOLD: electrical + thermal conductivity + XRD + processing present;
        SILVER: electrical conductivity plus XRD or processing;
        BRONZE: everything else.
        """
        has_ec = dp.electrical_conductivity_s_cm is not None
        has_tc = dp.thermal_conductivity_w_mk is not None
        has_xrd = (dp.xrd_crystallinity_percent is not None or
                   dp.xrd_pi_stacking_angstrom is not None)
        has_process = (dp.annealing_temp_c is not None and
                       dp.spin_speed_rpm is not None)

        if has_ec and has_tc and has_xrd and has_process:
            return DataQuality.GOLD
        elif has_ec and (has_xrd or has_process):
            return DataQuality.SILVER
        else:
            return DataQuality.BRONZE
592
+
593
+
594
# ============== NEW: Contextualized Extraction ==============

# NOTE(review): CONTEXTUALIZED_EXTRACTION_PROMPT is not referenced anywhere
# in this module (ContextualizedExtractor builds its prompt via
# build_extraction_prompt); confirm external callers before removing.
# The doubled braces ({{ }}) appear intended to keep the JSON example
# literal under str.format-style substitution of {title}/{content} — verify.
CONTEXTUALIZED_EXTRACTION_PROMPT = """
You are an expert in organic thermoelectrics and polymer science.
Extract ALL experimental data points from the provided paper.

## CRITICAL REQUIREMENTS

1. **Extract ALL values, not just the best one**
   - A paper may report multiple values under different conditions
   - Extract EACH value as a separate data point

2. **Include COMPLETE experimental conditions**
   - Every value must have its associated conditions
   - Common: temperature, annealing, doping level, measurement method

3. **MANDATORY: Include source quote**
   - For EACH data point, include the exact sentence from the paper
   - Quote must be >10 characters and reference the value

## TARGET PROPERTIES

- `electrical_conductivity` (S/cm, S/m)
- `thermal_conductivity` (W/mK)
- `seebeck_coefficient` (μV/K)
- `power_factor` (μW/mK²)
- `zt_figure_of_merit` (dimensionless)

## OUTPUT FORMAT (JSON Array)

Return ONLY valid JSON, no markdown, no explanation:

[
  {{
    "polymer_name": "PEDOT:PSS",
    "dopant": "H2SO4",
    "dopant_ratio": "5 vol%",
    "property_name": "electrical_conductivity",
    "raw_value": "4380",
    "raw_unit": "S/cm",
    "conditions": {{
      "solvent": "water",
      "annealing_temp_c": 150,
      "annealing_time_min": 10,
      "measurement_temp_k": 300,
      "measurement_method": "4-point probe"
    }},
    "source_quote": "The electrical conductivity reached 4380 S/cm after H2SO4 treatment.",
    "source_location": "Table 2, Sample S5",
    "extraction_confidence": 0.95
  }}
]

## RULES

1. If values range "from X to Y", extract BOTH as separate points
2. Preserve scientific notation as "5.2e3" or actual number
3. If no source quote found, set extraction_confidence < 0.5
4. Return ONLY valid JSON array, no other text

---

**PAPER CONTENT:**

Title: {title}

{content}

---

JSON output:
"""
666
+
667
+
668
class ContextualizedExtractor:
    """
    Contextualized data extractor.

    Produces ContextualizedValue objects with mandatory source quotes for traceability.
    """

    def __init__(
        self,
        model_id: Optional[str] = None,
        target_properties: Optional[List[str]] = None,
        extra_instructions: str = "",
    ):
        """
        Initialize extractor.

        Args:
            model_id: LLM model ID to use (default from config)
            target_properties: List of property keys to extract
            extra_instructions: Free-form LLM instructions
        """
        config = get_config()
        self.model_id = model_id or config.llm_model
        self.openai_base_url = _normalize_base_url(config.openai_base_url)
        self.openai_key = config.openai_api_key
        self.target_properties = target_properties or _DEFAULT_PROPERTIES
        self.extra_instructions = extra_instructions

    def is_configured(self) -> bool:
        # Requires a valid http(s) OpenAI-compatible base URL; unlike
        # DataExtractor, there is no Gemini fallback here.
        return _is_http_url(self.openai_base_url)

    def extract_from_paper(
        self,
        paper: PaperMetadata,
        use_full_text: bool = True
    ) -> "ExtractionResult":
        """
        Extract contextualized data from a paper.

        Args:
            paper: Paper metadata
            use_full_text: Use PDF full text if available

        Returns:
            ExtractionResult with ContextualizedValue data points
        """
        # NOTE(review): this local import shadows the module-level
        # ExtractionResult; the objects built below use paper_id/paper_title
        # fields, unlike DataExtractor's paper= variant — presumably a
        # different schema class; confirm in literature.schemas.
        from .schemas import ContextualizedValue, ExperimentalConditions, ExtractionResult

        logger.info(f"Contextualized extraction for: {paper.title[:50]}...")

        if not self.is_configured():
            return ExtractionResult(
                paper_id=paper.id,
                paper_title=paper.title,
                success=False,
                error_message="contextual_llm_unconfigured",
                extraction_notes="Contextualized extraction skipped because no OpenAI-compatible base URL is configured.",
            )

        # Prepare content
        content = paper.full_text if use_full_text and paper.full_text else paper.abstract
        if not content:
            return ExtractionResult(
                paper_id=paper.id,
                paper_title=paper.title,
                success=False,
                error_message="No content available"
            )

        # Truncate content to fit context window
        content = content[:15000]

        # Build dynamic prompt from target properties
        prompt_template = build_extraction_prompt(self.target_properties, self.extra_instructions)
        # str.replace (not str.format) avoids issues with braces in the paper text.
        prompt = prompt_template.replace("{title}", paper.title or "Unknown").replace("{content}", content)

        try:
            # Call LLM
            raw_response = self._call_llm(prompt)

            if not raw_response:
                # is_configured() already passed above, so this normally
                # reports an empty LLM response.
                return ExtractionResult(
                    paper_id=paper.id,
                    paper_title=paper.title,
                    success=False,
                    error_message="contextual_llm_unconfigured" if not self.is_configured() else "LLM returned empty response"
                )

            # Parse response
            data_points = self._parse_response(raw_response, paper.id)

            return ExtractionResult(
                paper_id=paper.id,
                paper_title=paper.title,
                data_points=data_points,
                extraction_model=self.model_id,
                success=True
            )

        except Exception as e:
            logger.warning(f"Contextualized extraction failed for {paper.id}: {e}")
            return ExtractionResult(
                paper_id=paper.id,
                paper_title=paper.title,
                success=False,
                error_message=str(e)
            )

    def _call_llm(self, prompt: str) -> Optional[str]:
        """Call LLM via OpenAI-compatible API.

        Returns the raw response text, or None when no base URL is configured.
        Raises httpx.HTTPStatusError on non-2xx responses.
        """
        import httpx

        if not self.is_configured():
            logger.debug("Contextualized extractor skipped: OpenAI-compatible base URL is not configured.")
            return None

        logger.info("Calling LLM for contextualized extraction...")
        logger.info(f"Using model: {self.model_id}")

        headers = {
            "Content-Type": "application/json",
        }
        # The Authorization header is optional: some local endpoints need no key.
        if self.openai_key:
            headers["Authorization"] = f"Bearer {self.openai_key}"

        payload = {
            "model": self.model_id,
            "messages": [{"role": "user", "content": prompt}],
            "temperature": 0.2,
            "max_tokens": 3000,
        }

        with httpx.Client(timeout=120) as client:
            response = client.post(
                f"{self.openai_base_url}/chat/completions",
                json=payload,
                headers=headers
            )
            response.raise_for_status()
            data = response.json()

        return data["choices"][0]["message"]["content"]

    def _parse_response(self, response: str, paper_id: str) -> List:
        """Parse LLM response into ContextualizedValue objects.

        Malformed JSON yields an empty list; individual bad records are
        skipped with a warning.
        """
        from .schemas import ContextualizedValue, ExperimentalConditions

        try:
            data = safe_json_loads(response)
        except Exception as e:
            logger.warning(f"JSON parse failed for {paper_id}: {e}")
            return []

        if data is None:
            return []

        if not isinstance(data, list):
            data = [data]

        results = []
        for item in data:
            if not isinstance(item, dict):
                continue

            try:
                # Handle conditions: pop so the remaining keys can be
                # splatted into ContextualizedValue below.
                conditions_data = item.pop("conditions", {})
                conditions = ExperimentalConditions(**conditions_data) if conditions_data else ExperimentalConditions()

                # Ensure required fields: source_quote is mandatory on the
                # schema, so synthesize a placeholder when the LLM omits it.
                if "source_quote" not in item or not item.get("source_quote"):
                    item["source_quote"] = f"[Extracted from {paper_id}]"

                value = ContextualizedValue(
                    conditions=conditions,
                    **item
                )
                results.append(value)
            except Exception as e:
                logger.warning(f"Failed to parse data point: {e}")
                continue

        logger.info(f"Extracted {len(results)} contextualized data points from {paper_id}")
        return results

    def extract_from_papers(
        self,
        papers: List[PaperMetadata],
        use_full_text: bool = True
    ) -> List:
        """Batch extraction from multiple papers.

        Per-paper failures are already absorbed inside extract_from_paper,
        so this loop never aborts early.
        """
        results = []
        for paper in papers:
            result = self.extract_from_paper(paper, use_full_text)
            results.append(result)
        return results
literature/graph.py ADDED
@@ -0,0 +1,450 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ LangGraph workflow for Literature Discovery System.
3
+ Implements: discover → download → extract → quality pipeline.
4
+
5
+ Key design principles:
6
+ 1. All state modifications must be explicit in return values
7
+ 2. No in-place object modification
8
+ 3. Each node returns logs for UI feedback
9
+ """
10
+ import logging
11
+ from typing import TypedDict, List, Optional, Annotated, Literal, Callable, Any
12
+ from datetime import datetime
13
+ import operator
14
+
15
+ from langgraph.graph import StateGraph, END, START
16
+ from langgraph.checkpoint.memory import MemorySaver
17
+
18
+ from .schemas import PaperMetadata, PolymerDataPoint, ExtractionResult, DataQuality
19
+ from .discovery import PaperDiscoveryAgent
20
+ from .extraction import DataExtractor
21
+ from .quality import QualityAssessor
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+
26
+ # ============== State Definition ==============
27
+
28
class LogEntry(TypedDict):
    """Log entry for UI feedback"""
    timestamp: str  # ISO-8601 timestamp (datetime.now().isoformat())
    node: str       # workflow node that emitted the entry
    message: str    # human-readable message shown in the UI
    level: str  # info, warning, error
34
+
35
+
36
class LiteratureState(TypedDict):
    """
    Workflow state.

    Important: LangGraph state updates are based on return values.
    If you modify a field, you MUST include it in the return dict.
    """
    # Input
    search_query: str
    max_papers: int
    use_full_text: bool

    # Progress tracking
    current_node: str
    progress_percent: int

    # Intermediate results
    papers: List[Any]  # List[PaperMetadata] serialized
    downloaded_pdfs: List[str]
    extraction_results: List[Any]  # List[ExtractionResult] serialized

    # Final output
    verified_data: List[Any]  # List[PolymerDataPoint] serialized
    quality_report: Optional[dict]

    # Logging & Status
    # operator.add reducer: each node's returned logs are appended, never replaced.
    logs: Annotated[List[LogEntry], operator.add]
    status: Literal["running", "completed", "failed", "cancelled"]
    error: Optional[str]
65
+
66
+
67
def create_initial_state(
    query: str,
    max_papers: int = 10,
    use_full_text: bool = False
) -> LiteratureState:
    """Build the starting state for a new workflow run."""
    state: LiteratureState = dict(
        search_query=query,
        max_papers=max_papers,
        use_full_text=use_full_text,
        current_node="start",
        progress_percent=0,
        papers=[],
        downloaded_pdfs=[],
        extraction_results=[],
        verified_data=[],
        quality_report=None,
        logs=[],
        status="running",
        error=None,
    )
    return state
88
+
89
+
90
+ # ============== Helper Functions ==============
91
+
92
+ def _log(node: str, message: str, level: str = "info") -> LogEntry:
93
+ """Create log entry"""
94
+ return {
95
+ "timestamp": datetime.now().isoformat(),
96
+ "node": node,
97
+ "message": message,
98
+ "level": level,
99
+ }
100
+
101
+
102
def _serialize_paper(paper: PaperMetadata) -> dict:
    """Serialize a PaperMetadata model to a plain dict for state storage."""
    return paper.model_dump()
105
+
106
+
107
def _deserialize_paper(data: dict) -> PaperMetadata:
    """Rebuild a PaperMetadata model from its serialized dict form."""
    return PaperMetadata(**data)
110
+
111
+
112
+ # ============== Node Functions ==============
113
+
114
def discover_node(state: LiteratureState) -> dict:
    """
    Paper discovery node.
    Uses existing PaperDiscoveryAgent (synchronous).
    """
    node_name = "discover"
    log_entries = [_log(node_name, f"Searching for: '{state['search_query']}'")]

    try:
        found = PaperDiscoveryAgent().discover(
            query=state["search_query"],
            limit_per_source=state["max_papers"],
        )
        log_entries.append(_log(node_name, f"Found {len(found)} unique papers"))

        return {
            # Papers are stored serialized so the state stays plain-dict.
            "papers": [_serialize_paper(p) for p in found],
            "current_node": node_name,
            "progress_percent": 25,
            "logs": log_entries,
        }
    except Exception as exc:
        logger.exception(f"Discover node failed: {exc}")
        log_entries.append(_log(node_name, f"Error: {exc}", "error"))
        return {
            "papers": [],
            "current_node": node_name,
            "status": "failed",
            "error": str(exc),
            "logs": log_entries,
        }
150
+
151
+
152
def download_node(state: LiteratureState) -> dict:
    """
    PDF download node.
    Uses existing PDFRetriever (synchronous).
    """
    from .retrieval import PDFRetriever

    node_name = "download"
    paper_dicts = state["papers"]
    log_entries = [_log(node_name, f"Downloading content for {len(paper_dicts)} papers")]

    try:
        # Rehydrate models, fetch PDFs, then re-serialize with updated pdf_path.
        hydrated = PDFRetriever().retrieve_batch(
            [_deserialize_paper(d) for d in paper_dicts]
        )
        with_pdf = [p for p in hydrated if p.pdf_path]
        log_entries.append(
            _log(node_name, f"Downloaded {len(with_pdf)}/{len(hydrated)} PDFs")
        )

        return {
            "papers": [_serialize_paper(p) for p in hydrated],
            "downloaded_pdfs": [p.pdf_path for p in with_pdf if p.pdf_path],
            "current_node": node_name,
            "progress_percent": 50,
            "logs": log_entries,
        }
    except Exception as exc:
        logger.exception(f"Download node failed: {exc}")
        log_entries.append(_log(node_name, f"Error: {exc}", "error"))
        return {
            "downloaded_pdfs": [],
            "current_node": node_name,
            "status": "failed",
            "error": str(exc),
            "logs": log_entries,
        }
195
+
196
+
197
def extract_node(state: LiteratureState) -> dict:
    """
    Data extraction node.
    Uses existing DataExtractor (synchronous).
    """
    node_name = "extract"
    log_entries = [_log(node_name, "Extracting structured data from papers")]

    try:
        # Only papers with a downloaded PDF or at least an abstract are usable.
        candidates = [
            p
            for p in (_deserialize_paper(d) for d in state["papers"])
            if p.pdf_path or p.abstract
        ]

        if not candidates:
            log_entries.append(_log(node_name, "No papers with content to extract", "warning"))
            return {
                "extraction_results": [],
                "current_node": node_name,
                "progress_percent": 75,
                "logs": log_entries,
            }

        log_entries.append(
            _log(node_name, f"Processing {len(candidates)} papers with content")
        )

        results = DataExtractor().extract_from_papers(
            candidates,
            use_full_text=state["use_full_text"]
        )

        point_total = sum(len(r.data_points) for r in results if r.success)
        log_entries.append(
            _log(node_name, f"Extracted {point_total} data points from {len(results)} papers")
        )

        # Serialize each result into a plain dict for state storage.
        serialized = [
            {
                "paper_id": r.paper.id if r.paper else "unknown",
                "success": r.success,
                "error_message": r.error_message,
                "data_points": [dp.model_dump() for dp in r.data_points] if r.data_points else [],
            }
            for r in results
        ]

        return {
            "extraction_results": serialized,
            "current_node": node_name,
            "progress_percent": 75,
            "logs": log_entries,
        }
    except Exception as exc:
        logger.exception(f"Extract node failed: {exc}")
        log_entries.append(_log(node_name, f"Error: {exc}", "error"))
        return {
            "extraction_results": [],
            "current_node": node_name,
            "status": "failed",
            "error": str(exc),
            "logs": log_entries,
        }
258
+
259
+
260
def quality_node(state: LiteratureState) -> dict:
    """
    Quality assessment node.
    """
    node_name = "quality"
    log_entries = [_log(node_name, "Assessing data quality")]

    try:
        # Rehydrate every data point from the serialized extraction results;
        # skip individual points that fail deserialization.
        all_points: List[PolymerDataPoint] = []
        for result_entry in state["extraction_results"]:
            if not (result_entry.get("success") and result_entry.get("data_points")):
                continue
            for payload in result_entry["data_points"]:
                try:
                    all_points.append(PolymerDataPoint(**payload))
                except Exception as exc:
                    logger.warning(f"Failed to deserialize data point: {exc}")

        if not all_points:
            log_entries.append(_log(node_name, "No data points to assess", "warning"))
            return {
                "verified_data": [],
                "quality_report": None,
                "current_node": node_name,
                "progress_percent": 100,
                "status": "completed",
                "logs": log_entries,
            }

        verified, report = QualityAssessor().assess_batch(all_points)
        log_entries.append(_log(node_name, report.summary()))

        return {
            "verified_data": [dp.model_dump() for dp in verified],
            "quality_report": {
                "total_points": report.total_points,
                "gold_count": report.gold_count,
                "silver_count": report.silver_count,
                "bronze_count": report.bronze_count,
                "invalid_count": report.invalid_count,
                "validation_errors": report.validation_errors,
            },
            "current_node": node_name,
            "progress_percent": 100,
            "status": "completed",
            "logs": log_entries,
        }
    except Exception as exc:
        logger.exception(f"Quality node failed: {exc}")
        log_entries.append(_log(node_name, f"Error: {exc}", "error"))
        return {
            "verified_data": [],
            "quality_report": None,
            "current_node": node_name,
            "status": "failed",
            "error": str(exc),
            "logs": log_entries,
        }
326
+
327
+
328
+ # ============== Conditional Edges ==============
329
+
330
def should_continue_after_discover(state: LiteratureState) -> str:
    """Route to download only when discovery succeeded and found papers."""
    failed = state.get("status") == "failed"
    empty = not state.get("papers")
    return "end" if failed or empty else "download"
337
+
338
+
339
def should_continue_after_download(state: LiteratureState) -> str:
    """Route to extract unless the run failed or there is nothing to process.

    Note: papers without PDFs still proceed, since abstracts may be usable.
    """
    if state.get("status") == "failed":
        return "end"
    has_material = bool(state.get("downloaded_pdfs")) or bool(state.get("papers"))
    return "extract" if has_material else "end"
346
+
347
+
348
def should_continue_after_extract(state: LiteratureState) -> str:
    """Route to quality assessment only if at least one data point was extracted."""
    if state.get("status") == "failed":
        return "end"

    successful = (
        r for r in state.get("extraction_results", []) if r.get("success")
    )
    extracted = sum(len(r.get("data_points", [])) for r in successful)
    return "quality" if extracted else "end"
363
+
364
+
365
+ # ============== Graph Builder ==============
366
+
367
def create_literature_graph(checkpointer=None):
    """
    Create the literature mining workflow graph.

    Args:
        checkpointer: Optional checkpoint storage (defaults to MemorySaver)

    Returns:
        Compiled LangGraph
    """
    workflow = StateGraph(LiteratureState)

    # Register the four pipeline stages.
    for name, fn in (
        ("discover", discover_node),
        ("download", download_node),
        ("extract", extract_node),
        ("quality", quality_node),
    ):
        workflow.add_node(name, fn)

    # Wire the pipeline; each stage can short-circuit to END on failure/empty results.
    workflow.add_edge(START, "discover")
    workflow.add_conditional_edges(
        "discover",
        should_continue_after_discover,
        {"download": "download", "end": END},
    )
    workflow.add_conditional_edges(
        "download",
        should_continue_after_download,
        {"extract": "extract", "end": END},
    )
    workflow.add_conditional_edges(
        "extract",
        should_continue_after_extract,
        {"quality": "quality", "end": END},
    )
    workflow.add_edge("quality", END)

    if checkpointer is None:
        checkpointer = MemorySaver()

    return workflow.compile(checkpointer=checkpointer)
415
+
416
+
417
+ # ============== Sync Runner ==============
418
+
419
def run_workflow(
    query: str,
    max_papers: int = 10,
    use_full_text: bool = False,
    thread_id: str = "default",
    on_state_update: Optional[Callable[[LiteratureState], None]] = None,
) -> LiteratureState:
    """
    Run the literature mining workflow (synchronous).

    Args:
        query: Search query
        max_papers: Max papers per source
        use_full_text: Whether to use full text extraction
        thread_id: Thread ID for state recovery
        on_state_update: Callback invoked with each streamed state snapshot

    Returns:
        Final state
    """
    graph = create_literature_graph()
    starting_state = create_initial_state(query, max_papers, use_full_text)
    run_config = {"configurable": {"thread_id": thread_id}}

    latest = None
    # stream_mode="values" yields the full state after every node execution.
    for snapshot in graph.stream(starting_state, run_config, stream_mode="values"):
        latest = snapshot
        if on_state_update:
            on_state_update(snapshot)

    return latest
literature/property_registry.py ADDED
@@ -0,0 +1,274 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Property catalog and extraction prompt builder for production literature mining.
3
+
4
+ This registry is aligned to the platform's public property keys so staged
5
+ literature evidence can be consumed by Property Probe and Discovery without
6
+ ad-hoc remapping.
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import re
11
+ from typing import Dict, List, Optional
12
+
13
+
14
# Canonical property key -> display metadata. Keys match the platform's public
# property identifiers so staged evidence needs no ad-hoc remapping; "unit" is
# the standard unit each property is expressed in.
PROPERTY_CATALOG: Dict[str, Dict[str, str]] = {
    # Thermal
    "tm": {"name": "Melting temperature", "unit": "K"},
    "tg": {"name": "Glass transition temperature", "unit": "K"},
    "td": {"name": "Thermal diffusivity", "unit": "m^2/s"},
    "tc": {"name": "Thermal conductivity", "unit": "W/(m*K)"},
    "cp": {"name": "Specific heat capacity", "unit": "J/(kg*K)"},
    # Mechanical
    "young": {"name": "Young's modulus", "unit": "GPa"},
    "shear": {"name": "Shear modulus", "unit": "GPa"},
    "bulk": {"name": "Bulk modulus", "unit": "GPa"},
    "poisson": {"name": "Poisson ratio", "unit": "dimensionless"},
    # Transport
    "visc": {"name": "Viscosity", "unit": "Pa*s"},
    "dif": {"name": "Diffusivity", "unit": "cm^2/s"},
    # Gas permeability
    "phe": {"name": "He permeability", "unit": "Barrer"},
    "ph2": {"name": "H2 permeability", "unit": "Barrer"},
    "pco2": {"name": "CO2 permeability", "unit": "Barrer"},
    "pn2": {"name": "N2 permeability", "unit": "Barrer"},
    "po2": {"name": "O2 permeability", "unit": "Barrer"},
    "pch4": {"name": "CH4 permeability", "unit": "Barrer"},
    # Electronic / optical
    "alpha": {"name": "Polarizability", "unit": "a.u."},
    "homo": {"name": "HOMO energy", "unit": "eV"},
    "lumo": {"name": "LUMO energy", "unit": "eV"},
    "bandgap": {"name": "Band gap", "unit": "eV"},
    "mu": {"name": "Dipole moment", "unit": "Debye"},
    "etotal": {"name": "Total electronic energy", "unit": "eV"},
    "ri": {"name": "Refractive index", "unit": "dimensionless"},
    "dc": {"name": "Dielectric constant", "unit": "dimensionless"},
    "pe": {"name": "Permittivity", "unit": "dimensionless"},
    # Structural / physical
    "rg": {"name": "Radius of gyration", "unit": "Angstrom"},
    "rho": {"name": "Density", "unit": "g/cm^3"},
    # Extended literature-only properties retained for discovery/search
    "electrical_conductivity": {"name": "Electrical conductivity", "unit": "S/cm"},
    "seebeck_coefficient": {"name": "Seebeck coefficient", "unit": "uV/K"},
    "power_factor": {"name": "Power factor", "unit": "uW/(m*K^2)"},
    "zt_figure_of_merit": {"name": "ZT figure of merit", "unit": "dimensionless"},
    "tensile_strength": {"name": "Tensile strength", "unit": "MPa"},
    "elongation_at_break": {"name": "Elongation at break", "unit": "%"},
    "crystallinity": {"name": "Crystallinity", "unit": "%"},
}
58
+
59
+
60
# Subset of PROPERTY_CATALOG keys that the platform itself consumes (excludes
# the literature-only extended properties).
PLATFORM_PROPERTY_KEYS = [
    "tm", "tg", "td", "tc", "cp",
    "young", "shear", "bulk", "poisson",
    "visc", "dif",
    "phe", "ph2", "pco2", "pn2", "po2", "pch4",
    "alpha", "homo", "lumo", "bandgap", "mu", "etotal", "ri", "dc", "pe",
    "rg", "rho",
]


# Named groups of property keys used as extraction presets.
TEMPLATES: Dict[str, List[str]] = {
    "thermal": ["tm", "tg", "td", "tc", "cp"],
    "mechanical": ["young", "shear", "bulk", "poisson", "tensile_strength", "elongation_at_break"],
    "electronic": ["bandgap", "homo", "lumo", "ri", "dc", "pe", "alpha", "mu", "etotal"],
    "gas_permeability": ["pco2", "po2", "pn2", "ph2", "phe", "pch4"],
    "transport": ["visc", "dif", "tc", "electrical_conductivity", "seebeck_coefficient", "power_factor"],
    "platform_core": PLATFORM_PROPERTY_KEYS,
}

# Human-readable labels for the template presets above (UI display).
TEMPLATE_LABELS: Dict[str, str] = {
    "thermal": "Thermal",
    "mechanical": "Mechanical",
    "electronic": "Electronic / Optical",
    "gas_permeability": "Gas Permeability",
    "transport": "Transport / Energy",
    "platform_core": "Platform Core",
}
87
+
88
+
89
# Free-form text -> canonical property key. Lookups go through _norm(), and the
# dict is further seeded below with normalized catalog keys and display names.
PROPERTY_ALIASES: Dict[str, str] = {
    "thermal conductivity": "tc",
    "heat conductivity": "tc",
    "thermal diffusivity": "td",
    "heat diffusivity": "td",
    "specific heat": "cp",
    "heat capacity": "cp",
    "young modulus": "young",
    "youngs modulus": "young",
    "young_s_modulus": "young",
    "young_modulus": "young",
    "shear modulus": "shear",
    "shear_modulus": "shear",
    "bulk modulus": "bulk",
    "bulk_modulus": "bulk",
    "poisson ratio": "poisson",
    "poisson_ratio": "poisson",
    "viscosity": "visc",
    "diffusivity": "dif",
    "he permeability": "phe",
    "helium permeability": "phe",
    "h2 permeability": "ph2",
    "co2 permeability": "pco2",
    "n2 permeability": "pn2",
    "o2 permeability": "po2",
    "ch4 permeability": "pch4",
    "polarizability": "alpha",
    "homo energy": "homo",
    "lumo energy": "lumo",
    "band gap": "bandgap",
    "bandgap": "bandgap",
    "dipole moment": "mu",
    "total electronic energy": "etotal",
    "refractive index": "ri",
    "dielectric constant": "dc",
    "permittivity": "pe",
    "radius of gyration": "rg",
    "density": "rho",
    "electrical conductivity": "electrical_conductivity",
    "conductivity": "electrical_conductivity",
    "seebeck coefficient": "seebeck_coefficient",
    "power factor": "power_factor",
    "zt": "zt_figure_of_merit",
    "zt figure of merit": "zt_figure_of_merit",
    "tensile strength": "tensile_strength",
    "elongation at break": "elongation_at_break",
    "co2_permeability": "pco2",
    "o2_permeability": "po2",
    "n2_permeability": "pn2",
    "h2_permeability": "ph2",
    "he_permeability": "phe",
    "ch4_permeability": "pch4",
    "radius_of_gyration": "rg",
    "refractive_index": "ri",
    "dielectric_constant": "dc",
    "dipole_moment": "mu",
}
146
+
147
+
148
+ def _norm(text: str) -> str:
149
+ normalized = re.sub(r"[^a-z0-9]+", " ", str(text or "").strip().lower())
150
+ return re.sub(r"\s+", " ", normalized).strip()
151
+
152
+
153
# Seed PROPERTY_ALIASES with the normalized catalog keys and display names so
# exact key/name lookups resolve without hand-listing every entry; setdefault
# keeps any explicitly-declared alias from being overwritten.
for key, meta in PROPERTY_CATALOG.items():
    PROPERTY_ALIASES.setdefault(_norm(key), key)
    PROPERTY_ALIASES.setdefault(_norm(meta["name"]), key)
156
+
157
+
158
def normalize_property_key(value: str | None) -> Optional[str]:
    """Map free-form property text to a canonical registry key, or None."""
    if not value:
        return None
    candidate = PROPERTY_ALIASES.get(_norm(value))
    # Only keys present in the catalog are considered canonical.
    return candidate if candidate in PROPERTY_CATALOG else None
166
+
167
+
168
def detect_property_keys(text: str) -> List[str]:
    """Return all unique property keys whose aliases appear in the free-form text."""
    haystack = _norm(text)
    # Dict keys preserve first-insertion order, giving de-duplication for free.
    hits = {
        canonical: None
        for alias, canonical in PROPERTY_ALIASES.items()
        if alias and alias in haystack
    }
    return list(hits)
176
+
177
+
178
def property_display_name(key: str) -> str:
    """Human-readable 'Name (unit)' label for a key; falls back to the key itself."""
    meta = PROPERTY_CATALOG.get(key)
    return f"{meta['name']} ({meta['unit']})" if meta else key
183
+
184
+
185
def _property_list_block(property_keys: List[str]) -> str:
    """Build the target-properties bullet list for the extraction prompt."""
    rendered = []
    for key in property_keys:
        meta = PROPERTY_CATALOG.get(key)
        # Unknown keys still get a bare bullet so the caller's list is preserved.
        line = (
            f"- `{key}` ({meta['name']}) -- standard unit: {meta['unit']}"
            if meta
            else f"- `{key}`"
        )
        rendered.append(line)
    return "\n".join(rendered)
195
+
196
+
197
def build_extraction_prompt(
    property_keys: List[str],
    extra_instructions: str = "",
) -> str:
    """
    Build a dynamic contextualized extraction prompt from the given property list.

    Args:
        property_keys: Property keys (or free-form names) to target; each is
            normalized via normalize_property_key, falling back to the raw key.
        extra_instructions: Optional extra context appended as its own section.

    Returns:
        The prompt text. Note it still contains literal ``{title}`` and
        ``{content}`` placeholders (``{{...}}`` in the f-string below) —
        presumably filled in later by the caller via str.format; verify there.
    """
    normalized_keys = [normalize_property_key(k) or k for k in property_keys if k]
    props_block = _property_list_block(normalized_keys)

    # Only emit the ADDITIONAL CONTEXT section when instructions were given.
    extra_section = ""
    if extra_instructions.strip():
        extra_section = f"""
## ADDITIONAL CONTEXT

{extra_instructions.strip()}
"""

    prompt = f"""You are an expert in polymer science and materials characterization.
Extract experimentally grounded evidence records from the provided paper.

## CRITICAL REQUIREMENTS

1. Extract each material-property-value observation as a separate record
2. Preserve the original value and unit exactly as written
3. Include experimental conditions and measurement method whenever available
4. Include a source quote and source location for every record
5. Ignore theoretical-only values unless the paper explicitly reports an experiment-backed measurement

## TARGET PROPERTIES

For each data point, extract these properties:
{props_block}
{extra_section}
## OUTPUT FORMAT (JSON Array)

Return ONLY valid JSON, no markdown, no explanation:

[
  {{
    "polymer_name": "P3HT",
    "property_name": "<one of the target property keys above>",
    "raw_value": "1.9",
    "raw_unit": "eV",
    "conditions": {{
      "solvent": "chloroform",
      "annealing_temp_c": 150,
      "annealing_time_min": 10,
      "measurement_temp_k": 300,
      "measurement_method": "UV-Vis"
    }},
    "source_quote": "The optical band gap of P3HT was determined to be 1.9 eV from the UV-Vis absorption onset.",
    "source_location": "Table 1",
    "extraction_confidence": 0.95
  }}
]

## RULES

1. If values range "from X to Y", extract BOTH as separate points
2. Preserve scientific notation as "5.2e3" or actual number
3. If no source quote is available, lower extraction_confidence below 0.5
4. Prefer experimentally measured values over model predictions or simulations
5. Return ONLY a valid JSON array, no extra text

---

**PAPER CONTENT:**

Title: {{title}}

{{content}}

---

JSON output:
"""
    return prompt
literature/quality.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Production quality assessment and validation for literature evidence.
3
+ """
4
+ from __future__ import annotations
5
+
6
+ import logging
7
+ from dataclasses import dataclass
8
+ from typing import Any, Dict, List, Optional, Tuple
9
+
10
+ from .schemas import ContextualizedValue, DataQuality, PolymerDataPoint
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
@dataclass
class QualityReport:
    """Data quality report for a batch of data points."""
    total_points: int
    gold_count: int
    silver_count: int
    bronze_count: int
    invalid_count: int
    validation_errors: List[str]

    @property
    def gold_ratio(self) -> float:
        """Fraction of points rated gold; safe for an empty batch (returns 0.0)."""
        denominator = max(self.total_points, 1)
        return self.gold_count / denominator

    def summary(self) -> str:
        """Multi-line, human-readable summary of the tier counts."""
        lines = [
            f"Quality Report: {self.total_points} points",
            f"  Gold: {self.gold_count} ({self.gold_ratio:.1%})",
            f"  Silver: {self.silver_count}",
            f"  Bronze: {self.bronze_count}",
            f"  Invalid: {self.invalid_count}",
            f"  Errors: {len(self.validation_errors)}",
        ]
        return "\n".join(lines)
38
+
39
+
40
class QualityAssessor:
    """Quality assessor with property-aware sanity checks.

    Supports two paths:
    - assess_batch: legacy path operating on PolymerDataPoint objects
      (mutates each point's quality_tier in place).
    - validate_contextual_value / assess_contextual_quality: per-record
      validation and tiering for ContextualizedValue evidence.
    """

    # Plausible (low, high) ranges per canonical property key, in the
    # registry's standard units; values outside are rejected as invalid.
    # A None bound means "unbounded on that side".
    PROPERTY_BOUNDS: Dict[str, Tuple[Optional[float], Optional[float]]] = {
        "tm": (50, 2000),
        "tg": (50, 2000),
        "td": (1e-10, 1.0),
        "tc": (1e-4, 1000.0),
        "cp": (1.0, 1e7),
        "young": (1e-6, 1e5),
        "shear": (1e-6, 1e5),
        "bulk": (1e-6, 1e5),
        "poisson": (-1.0, 0.5),
        "visc": (1e-9, 1e9),
        "dif": (1e-12, 10.0),
        "rho": (1e-6, 100.0),
        "ri": (0.5, 10.0),
        "bandgap": (-20.0, 20.0),
        "homo": (-30.0, 10.0),
        "lumo": (-30.0, 20.0),
        "mu": (0.0, 1e4),
        "electrical_conductivity": (1e-12, 1e8),
        "seebeck_coefficient": (-1e5, 1e5),
        "power_factor": (0.0, 1e9),
        "zt_figure_of_merit": (0.0, 1e4),
    }

    def __init__(self) -> None:
        # Accumulated validation error messages from the last assess_batch run.
        self.errors: List[str] = []

    def assess_batch(self, data_points: List[PolymerDataPoint]) -> Tuple[List[PolymerDataPoint], QualityReport]:
        """Legacy compatibility path used by older scripts."""
        # Reset per-run error accumulator.
        self.errors = []
        valid_points: List[PolymerDataPoint] = []
        gold_count = silver_count = bronze_count = invalid_count = 0

        for dp in data_points:
            is_valid, error_msg = self._validate_legacy(dp)
            if not is_valid:
                self.errors.append(f"{dp.source_paper_id}: {error_msg}")
                invalid_count += 1
                continue

            # NOTE: mutates the data point in place with its computed tier.
            dp.quality_tier = self._compute_legacy_quality_tier(dp)
            if dp.quality_tier == DataQuality.GOLD:
                gold_count += 1
            elif dp.quality_tier == DataQuality.SILVER:
                silver_count += 1
            else:
                bronze_count += 1
            valid_points.append(dp)

        report = QualityReport(
            total_points=len(data_points),
            gold_count=gold_count,
            silver_count=silver_count,
            bronze_count=bronze_count,
            invalid_count=invalid_count,
            validation_errors=self.errors.copy(),
        )
        logger.info(report.summary())
        return valid_points, report

    def validate_contextual_value(self, value: ContextualizedValue) -> Tuple[bool, Optional[str]]:
        """Validate one contextual record; returns (is_valid, error_message)."""
        if not value.polymer_name or value.polymer_name.strip().lower() == "unknown":
            return False, "Missing material name"
        if not value.property_name:
            return False, "Missing property key"
        if value.standardized_value is None:
            return False, "Missing standardized value"
        if not value.source_quote or len(value.source_quote.strip()) < 10:
            return False, "Missing source quote"

        # Properties without declared bounds pass the range check automatically.
        bounds = self.PROPERTY_BOUNDS.get(value.property_name)
        if bounds is None:
            return True, None

        low, high = bounds
        numeric = value.standardized_value
        if low is not None and numeric < low:
            return False, f"Value below plausible range: {numeric}"
        if high is not None and numeric > high:
            return False, f"Value above plausible range: {numeric}"
        return True, None

    def assess_contextual_quality(self, value: ContextualizedValue) -> DataQuality:
        """Score a contextual record into a GOLD/SILVER/BRONZE tier.

        Score components: standardized value (+2), up to 3 reported
        conditions, measurement method (+1), source location (+1), and
        extraction confidence (+2 at >=0.9, +1 at >=0.7).
        """
        score = 0
        if value.standardized_value is not None:
            score += 2
        if value.conditions.to_dict():
            score += min(len(value.conditions.to_dict()), 3)
        if value.conditions.measurement_method:
            score += 1
        if value.source_location:
            score += 1
        if value.extraction_confidence >= 0.9:
            score += 2
        elif value.extraction_confidence >= 0.7:
            score += 1

        if score >= 7:
            return DataQuality.GOLD
        if score >= 4:
            return DataQuality.SILVER
        return DataQuality.BRONZE

    def _validate_legacy(self, dp: PolymerDataPoint) -> Tuple[bool, Optional[str]]:
        """Minimal validation for legacy points: name plus any one measurement."""
        if not dp.polymer_name or dp.polymer_name == "Unknown":
            return False, "Missing polymer name"
        has_measurement = any([
            dp.electrical_conductivity_s_cm is not None,
            dp.thermal_conductivity_w_mk is not None,
            dp.seebeck_coefficient_uv_k is not None,
        ])
        if not has_measurement:
            return False, "No measurement values"
        return True, None

    def _compute_legacy_quality_tier(self, dp: PolymerDataPoint) -> DataQuality:
        """Tier a legacy point by which measurements/metadata are present."""
        score = 0
        if dp.electrical_conductivity_s_cm is not None:
            score += 3
        if dp.seebeck_coefficient_uv_k is not None:
            score += 2
        if dp.power_factor_uw_m_k2 is not None:
            score += 1
        # Thermal conductivity is weighted highest as the rarest measurement here.
        if dp.thermal_conductivity_w_mk is not None:
            score += 4
        if dp.source_table_or_figure:
            score += 1
        if dp.annealing_temp_c is not None:
            score += 1
        if score >= 7:
            return DataQuality.GOLD
        if score >= 4:
            return DataQuality.SILVER
        return DataQuality.BRONZE
literature/retrieval.py ADDED
@@ -0,0 +1,398 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PDF retrieval module.
3
+ Downloads papers from ArXiv (priority) and via Unpaywall.
4
+ Implements robust header spoofing and graceful error handling.
5
+ """
6
+ import logging
7
+ import os
8
+ import time
9
+ from pathlib import Path
10
+ from typing import Optional, List
11
+ import requests
12
+
13
+ from .schemas import PaperMetadata, PaperSource
14
+ from .config import get_config
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
class PDFRetriever:
    """
    PDF retrieval with robust error handling.

    Priority:
        1. ArXiv (direct, free, reliable)
        2. PubMed Central (PMC) for PubMed papers
        3. Existing pdf_url from metadata
        4. Unpaywall via DOI
    """

    def __init__(self) -> None:
        config = get_config()
        # Local cache directory for downloaded PDFs; created on demand.
        self.storage_dir = Path(config.pdf_storage_dir)
        self.storage_dir.mkdir(parents=True, exist_ok=True)

        # Browser-like headers to avoid 403 from some publishers.
        self.headers = {
            "User-Agent": config.user_agent,
            "Accept": "application/pdf,*/*",
            "Accept-Language": "en-US,en;q=0.9",
            "Accept-Encoding": "gzip, deflate, br",
            "Connection": "keep-alive",
        }

        self.timeout = 60  # seconds per download request
        # Unpaywall requires a contact email; reuse the PubMed one.
        self.unpaywall_email = config.pubmed_email

    def retrieve_batch(
        self,
        papers: List[PaperMetadata],
        skip_existing: bool = True
    ) -> List[PaperMetadata]:
        """
        Download PDFs for a batch of papers.

        Updates paper.pdf_path for successful downloads.
        Saves all papers and failed downloads to CSVs.

        Args:
            papers: List of paper metadata
            skip_existing: Skip if PDF already exists

        Returns:
            Updated list of papers with pdf_path set where successful
        """
        successful_ids: set = set()
        failed_papers: List[PaperMetadata] = []

        for paper in papers:
            try:
                pdf_path = self.retrieve_single(paper, skip_existing=skip_existing)
                if pdf_path:
                    paper.pdf_path = pdf_path
                    successful_ids.add(paper.id)
                else:
                    failed_papers.append(paper)
            except Exception as e:
                # One bad paper must not abort the whole batch.
                logger.warning(f"PDF retrieval failed for {paper.id}: {e}")
                failed_papers.append(paper)

        logger.info(f"PDF retrieval complete: {len(successful_ids)} successful, {len(failed_papers)} failed")

        # Save all papers with download status
        self._save_all_papers(papers, successful_ids)

        # Save failed downloads for manual retrieval
        if failed_papers:
            self._save_failed_downloads(failed_papers)

        return papers

    def _save_failed_downloads(self, papers: List[PaperMetadata]) -> None:
        """Append failed downloads to failed_downloads.csv for manual retrieval."""
        import csv
        from datetime import datetime

        csv_path = self.storage_dir / "failed_downloads.csv"
        file_exists = csv_path.exists()

        with open(csv_path, "a", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)

            # Write header if new file
            if not file_exists:
                writer.writerow([
                    "timestamp", "paper_id", "title", "source", "doi", "url", "expected_filename"
                ])

            timestamp = datetime.now().isoformat()
            for paper in papers:
                # Mirror the filename scheme used by retrieve_single so a
                # manually fetched PDF can be dropped into storage_dir.
                safe_id = paper.id.replace("/", "_").replace(":", "_")
                expected_filename = f"{safe_id}.pdf"
                writer.writerow([
                    timestamp,
                    paper.id,
                    paper.title[:100],  # Truncate long titles
                    paper.source.value,
                    paper.doi or "",
                    paper.url or "",
                    expected_filename
                ])

        logger.info(f"Saved {len(papers)} failed downloads to {csv_path}")

    def _save_all_papers(
        self,
        papers: List[PaperMetadata],
        successful_ids: set
    ) -> None:
        """Overwrite all_papers.csv with every discovered paper and its download status."""
        import csv
        from datetime import datetime

        csv_path = self.storage_dir / "all_papers.csv"

        with open(csv_path, "w", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            writer.writerow([
                "paper_id", "title", "source", "year", "doi", "url",
                "pdf_downloaded", "pdf_path", "timestamp"
            ])

            timestamp = datetime.now().isoformat()
            for paper in papers:
                # A pre-existing pdf_path also counts as downloaded.
                downloaded = paper.id in successful_ids or paper.pdf_path is not None
                writer.writerow([
                    paper.id,
                    paper.title[:150],
                    paper.source.value,
                    paper.year or "",
                    paper.doi or "",
                    paper.url or "",
                    "YES" if downloaded else "NO",
                    paper.pdf_path or "",
                    timestamp
                ])

        logger.info(f"Saved {len(papers)} papers to {csv_path}")

    def retrieve_single(
        self,
        paper: PaperMetadata,
        skip_existing: bool = True
    ) -> Optional[str]:
        """
        Download PDF for a single paper.

        Args:
            paper: Paper metadata
            skip_existing: Skip if file already exists

        Returns:
            Path to downloaded PDF, or None if failed
        """
        # Determine a filesystem-safe filename derived from the paper ID.
        safe_id = paper.id.replace("/", "_").replace(":", "_")
        pdf_filename = f"{safe_id}.pdf"
        pdf_path = self.storage_dir / pdf_filename

        # Check if already exists
        if skip_existing and pdf_path.exists():
            logger.debug(f"PDF already exists: {pdf_path}")
            return str(pdf_path)

        # Try download methods in priority order
        pdf_url = self._get_pdf_url(paper)

        if pdf_url:
            success = self._download_pdf(pdf_url, pdf_path)
            if success:
                logger.info(f"Downloaded PDF: {pdf_path}")
                return str(pdf_path)

        logger.warning(f"Could not download PDF for {paper.id}")
        return None

    def _get_pdf_url(self, paper: PaperMetadata) -> Optional[str]:
        """
        Get PDF URL using priority order:
        1. ArXiv direct link
        2. PubMed Central (PMC) for PubMed papers
        3. Existing pdf_url from metadata
        4. Unpaywall via DOI
        """
        # Priority 1: ArXiv (most reliable, free)
        if paper.source == PaperSource.ARXIV:
            arxiv_id = paper.id.replace("arxiv_", "")
            return f"https://arxiv.org/pdf/{arxiv_id}.pdf"

        # Priority 2: PubMed - try PMC first
        if paper.source == PaperSource.PUBMED:
            pmc_url = self._get_pmc_pdf_url(paper)
            if pmc_url:
                return pmc_url

        # Priority 3: Use existing pdf_url if available
        if paper.pdf_url:
            return paper.pdf_url

        # Priority 4: Try Unpaywall via DOI (works for all sources)
        if paper.doi:
            unpaywall_url = self._get_unpaywall_url(paper.doi)
            if unpaywall_url:
                return unpaywall_url

        return None

    def _get_pmc_pdf_url(self, paper: PaperMetadata) -> Optional[str]:
        """
        Try to get PDF from PubMed Central (PMC).
        PMC provides free full-text PDFs for many PubMed articles.

        Returns the PMC PDF URL, or None if no PMC record is linked.
        """
        try:
            pmid = paper.id.replace("pubmed_", "")

            # elink maps a PubMed ID to its PMC record, if any.
            from Bio import Entrez
            handle = Entrez.elink(dbfrom="pubmed", db="pmc", id=pmid)
            record = Entrez.read(handle)
            handle.close()

            # Check if PMC ID exists
            link_sets = record[0].get("LinkSetDb", [])
            for link_set in link_sets:
                if link_set.get("DbTo") == "pmc":
                    links = link_set.get("Link", [])
                    if links:
                        pmc_id = links[0]["Id"]
                        # PMC PDF URL format
                        return f"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC{pmc_id}/pdf/"

            return None

        except Exception as e:
            # Best-effort lookup: fall through to the next retrieval source.
            logger.debug(f"PMC lookup failed for {paper.id}: {e}")
            return None

    def _get_unpaywall_url(self, doi: str) -> Optional[str]:
        """
        Query Unpaywall API for open-access PDF URL.

        Args:
            doi: Paper DOI

        Returns:
            PDF URL if found, None otherwise
        """
        try:
            url = f"https://api.unpaywall.org/v2/{doi}"
            params = {"email": self.unpaywall_email}

            response = requests.get(
                url,
                params=params,
                headers=self.headers,
                timeout=30
            )

            if response.status_code != 200:
                logger.debug(f"Unpaywall returned {response.status_code} for {doi}")
                return None

            data = response.json()

            # Check for best open access location
            best_oa = data.get("best_oa_location")
            if best_oa and best_oa.get("url_for_pdf"):
                return best_oa["url_for_pdf"]

            # Check all OA locations
            oa_locations = data.get("oa_locations", [])
            for loc in oa_locations:
                if loc.get("url_for_pdf"):
                    return loc["url_for_pdf"]

            return None

        except Exception as e:
            logger.debug(f"Unpaywall query failed for {doi}: {e}")
            return None

    def _download_pdf(self, url: str, save_path: Path) -> bool:
        """
        Download PDF from URL with robust error handling.

        The response is streamed to disk; only the first chunk is held in
        memory for the %PDF magic-byte check (the previous implementation
        read the whole body via response.content, defeating stream=True).

        Args:
            url: PDF URL
            save_path: Local path to save file

        Returns:
            True if successful, False otherwise
        """
        try:
            logger.debug(f"Downloading PDF from: {url}")

            with requests.get(
                url,
                headers=self.headers,
                timeout=self.timeout,
                stream=True,
                allow_redirects=True
            ) as response:
                # Check for success
                if response.status_code != 200:
                    logger.warning(f"Download failed with status {response.status_code}: {url}")
                    return False

                chunks = response.iter_content(chunk_size=8192)
                first_chunk = next(chunks, b"")

                # Verify it's a PDF (check content-type or magic bytes)
                content_type = response.headers.get("content-type", "").lower()
                if "pdf" not in content_type and "octet-stream" not in content_type:
                    # Check magic bytes as fallback
                    if not first_chunk.startswith(b"%PDF"):
                        logger.warning(f"Response is not a PDF: {content_type}")
                        return False

                # Save to file
                with open(save_path, "wb") as f:
                    f.write(first_chunk)
                    for chunk in chunks:
                        f.write(chunk)

            # Verify file was written
            if save_path.exists() and save_path.stat().st_size > 0:
                return True

            return False

        except requests.exceptions.Timeout:
            logger.warning(f"Download timeout: {url}")
            return False
        except requests.exceptions.RequestException as e:
            logger.warning(f"Download error: {e}")
            return False
        except Exception as e:
            logger.error(f"Unexpected error downloading {url}: {e}")
            return False
355
+
356
+
357
def extract_text_from_pdf(pdf_path: str, max_pages: int = 100) -> Optional[str]:
    """
    Extract text from PDF using pymupdf.

    Args:
        pdf_path: Path to PDF file
        max_pages: Maximum pages to extract (default 100)

    Returns:
        Extracted text with "--- Page N ---" markers, or None if
        pymupdf is unavailable, extraction fails, or the text is empty
    """
    try:
        import pymupdf  # fitz
    except ImportError:
        try:
            # Older installs expose the library under its legacy name.
            import fitz as pymupdf
        except ImportError:
            logger.error("pymupdf not installed. Run: pip install pymupdf")
            return None

    try:
        doc = pymupdf.open(pdf_path)
        try:
            pages_to_extract = min(len(doc), max_pages)
            text_parts: List[str] = []

            for page_num in range(pages_to_extract):
                page = doc[page_num]
                text = page.get_text()
                if text:
                    text_parts.append(f"--- Page {page_num + 1} ---\n{text}")
        finally:
            # Ensure the document handle is released even if a page fails.
            doc.close()

        full_text = "\n\n".join(text_parts)
        logger.info(f"Extracted {len(full_text)} chars from {pages_to_extract} pages of {pdf_path}")

        return full_text if full_text.strip() else None

    except Exception as e:
        logger.error(f"PDF text extraction failed for {pdf_path}: {e}")
        return None
literature/schemas.py ADDED
@@ -0,0 +1,329 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Domain-specific data models for literature mining.
3
+ Supports contextualized extraction with source traceability.
4
+ """
5
+ from typing import Optional, List, Dict, Any
6
+ from pydantic import BaseModel, Field, field_validator, model_validator, ConfigDict
7
+ from datetime import datetime
8
+ from enum import Enum
9
+
10
+
11
class DataQuality(str, Enum):
    """Data quality tier assigned to extracted evidence."""
    GOLD = "gold"      # Complete data with source quote
    SILVER = "silver"  # Partial data with source
    BRONZE = "bronze"  # Limited data or no source
    ERROR = "error"    # Extraction failed
17
+
18
+
19
class QueryMode(str, Enum):
    """High-level search entrypoint modes for the literature UI."""
    MATERIAL = "material-first"  # query anchored on a material/polymer name
    PROPERTY = "property-first"  # query anchored on a target property
    TASK = "task-first"          # query anchored on a free-form task description
24
+
25
+
26
class ReviewStatus(str, Enum):
    """Human review status for staged evidence records."""
    PENDING = "pending"    # awaiting reviewer decision (default)
    APPROVED = "approved"  # accepted by a reviewer
    REJECTED = "rejected"  # rejected by a reviewer
31
+
32
+
33
class PaperSource(str, Enum):
    """Paper source identifier used as the ID prefix for discovered papers."""
    PUBMED = "pubmed"
    ARXIV = "arxiv"
    SEMANTIC_SCHOLAR = "s2"  # Semantic Scholar
    MANUAL = "manual"        # manually added papers
    UNKNOWN = "unknown"
40
+
41
+
42
class PaperMetadata(BaseModel):
    """Paper metadata from discovery.

    ``pdf_path`` and ``full_text`` start empty and are filled in later by
    the retrieval/extraction stages.
    """
    id: str = Field(..., description="Unique ID, format: {source}_{original_id}")
    title: str
    authors: List[str] = Field(default_factory=list)
    year: Optional[int] = None
    doi: Optional[str] = None
    abstract: Optional[str] = None
    venue: Optional[str] = None
    citation_count: Optional[int] = None
    is_open_access: Optional[bool] = None
    source: PaperSource = PaperSource.UNKNOWN
    url: Optional[str] = None
    landing_url: Optional[str] = None
    pdf_url: Optional[str] = None
    pdf_path: Optional[str] = None  # local filesystem path once downloaded
    full_text: Optional[str] = None
    match_reasons: List[str] = Field(default_factory=list)
    background_status: Optional[str] = None
    retrieved_at: datetime = Field(default_factory=datetime.now)

    @field_validator('id')
    @classmethod
    def validate_id_format(cls, v: str) -> str:
        """Ensure the ID carries one of the known source prefixes.

        NOTE(review): there is no prefix for PaperSource.UNKNOWN, so IDs for
        unknown-source papers must still use one of these prefixes — confirm
        that this is intentional.
        """
        valid_prefixes = ['pubmed_', 'arxiv_', 's2_', 'manual_']
        if not any(v.startswith(p) for p in valid_prefixes):
            raise ValueError(f"ID must start with one of {valid_prefixes}")
        return v
71
+
72
+
73
class LiteratureQuerySpec(BaseModel):
    """Normalized query payload used by the production literature UI."""
    mode: QueryMode       # which search entrypoint drives the query
    user_query: str       # raw text entered by the user
    polymer_name: Optional[str] = None
    canonical_smiles: Optional[str] = None
    property_key: Optional[str] = None
    project_id: Optional[str] = None
    top_k_extract: int = Field(default=10, ge=1, le=50)    # papers sent to extraction
    result_limit: int = Field(default=15, ge=1, le=100)    # papers shown to the user
83
+
84
+
85
class PaperCardResult(BaseModel):
    """User-facing paper card summary shown in search results."""
    paper_id: str
    title: str
    year: Optional[int] = None
    venue: Optional[str] = None
    doi: Optional[str] = None
    landing_url: Optional[str] = None
    pdf_url: Optional[str] = None
    is_open_access: bool = False
    match_reasons: List[str] = Field(default_factory=list)  # why this paper matched
    background_status: str = "discovered"                   # pipeline stage label
97
+
98
+
99
class LiteratureSupportSummary(BaseModel):
    """Aggregated evidence coverage for a material/property view."""
    matched_paper_count: int = 0
    oa_paper_count: int = 0              # papers flagged open access
    evidence_record_count: int = 0
    approved_record_count: int = 0       # records with review_status == APPROVED
    has_experimental_evidence: bool = False
    literature_support_score: int = Field(default=0, ge=0, le=100)  # 0-100 composite
107
+
108
+
109
class LiteratureEvidenceRecord(BaseModel):
    """Production staging record for extracted literature evidence."""
    # Identity / linkage
    id: Optional[str] = None
    project_id: Optional[str] = None
    paper_id: str
    # Material and property
    material_name: str
    canonical_smiles: Optional[str] = None
    property_key: str
    # Raw value as reported and its standardized form
    raw_value: str
    raw_unit: str
    standardized_value: Optional[float] = None
    standardized_unit: Optional[str] = None
    # Experimental context
    conditions_json: Dict[str, Any] = Field(default_factory=dict)
    method: Optional[str] = None
    # Source traceability
    evidence_quote: str
    evidence_location: Optional[str] = None
    # Extraction provenance and quality
    extractor_version: str
    extraction_model: Optional[str] = None
    extraction_confidence: float = Field(default=0.5, ge=0.0, le=1.0)
    quality_tier: DataQuality = DataQuality.BRONZE
    # Human review workflow
    review_status: ReviewStatus = ReviewStatus.PENDING
    reviewer_note: Optional[str] = None
    edited_payload_json: Optional[Dict[str, Any]] = None
    created_at: Optional[str] = None
    updated_at: Optional[str] = None

    @field_validator("evidence_quote")
    @classmethod
    def validate_evidence_quote(cls, v: str) -> str:
        """Require a trimmed quote of at least 10 characters."""
        text = str(v or "").strip()
        if len(text) < 10:
            raise ValueError("evidence_quote must be at least 10 characters")
        return text
142
+
143
+
144
+ # ============== Experimental Conditions ==============
145
+
146
class ExperimentalConditions(BaseModel):
    """
    Experimental conditions with full context.

    NOTE: extra="allow" keeps LLM-returned fields like humidity, substrate,
    etc. that are not modeled explicitly below.
    """
    model_config = ConfigDict(extra="allow")

    # Preparation conditions
    solvent: Optional[str] = None
    concentration_mg_ml: Optional[float] = None
    spin_speed_rpm: Optional[int] = None
    spin_time_s: Optional[int] = None
    annealing_temp_c: Optional[float] = None
    annealing_time_min: Optional[float] = None
    annealing_atmosphere: Optional[str] = None
    film_thickness_nm: Optional[float] = None

    # Measurement conditions
    measurement_temp_k: Optional[float] = Field(None, description="Measurement temperature (K)")
    measurement_method: Optional[str] = None
    measurement_direction: Optional[str] = None  # in-plane, cross-plane

    def to_dict(self) -> dict:
        """Convert to dict, excluding None values."""
        return {k: v for k, v in self.model_dump().items() if v is not None}
172
+
173
+
174
+ # ============== Contextualized Value ==============
175
+
176
class ContextualizedValue(BaseModel):
    """
    Measurement value with full experimental context and source traceability.

    Design principles:
    - Same paper may report multiple values under different conditions
    - Each value MUST have its associated experimental conditions
    - MANDATORY: source_quote for traceability
    """
    model_config = ConfigDict(extra="allow")

    # Material
    polymer_name: str = Field(..., description="Polymer name e.g. PEDOT:PSS")
    dopant: Optional[str] = None
    dopant_ratio: Optional[str] = None

    # Property measured
    property_name: str = Field(..., description="Property name e.g. electrical_conductivity")

    # Raw value
    raw_value: str = Field(..., description="Raw value string from paper")
    raw_unit: str = Field(..., description="Original unit from paper")

    # Standardized value (filled by Standardizer)
    standardized_value: Optional[float] = None
    standardized_unit: Optional[str] = None
    standardization_error: Optional[str] = None

    # Experimental conditions
    conditions: ExperimentalConditions = Field(default_factory=ExperimentalConditions)

    # Source traceability (MANDATORY!)
    source_quote: str = Field(..., description="Exact quote from paper containing this value")
    source_location: Optional[str] = Field(None, description="Table 1, Figure 3a, etc.")

    # Quality
    extraction_confidence: float = Field(default=0.5, ge=0.0, le=1.0)
    quality_tier: DataQuality = DataQuality.BRONZE

    @field_validator('source_quote')
    @classmethod
    def quote_not_empty(cls, v: str) -> str:
        """Require a trimmed quote of at least 10 characters.

        The threshold and message mirror
        LiteratureEvidenceRecord.validate_evidence_quote.
        """
        if not v or len(v.strip()) < 10:
            raise ValueError("source_quote must be at least 10 characters")
        return v.strip()

    def to_db_dict(self) -> dict:
        """Convert to database storage format."""
        return {
            "polymer_name": self.polymer_name,
            "dopant": self.dopant,
            "dopant_ratio": self.dopant_ratio,
            "property_name": self.property_name,
            "raw_value": self.raw_value,
            "raw_unit": self.raw_unit,
            "standardized_value": self.standardized_value,
            "standardized_unit": self.standardized_unit,
            "conditions": self.conditions.to_dict(),
            "source_quote": self.source_quote,
            "source_location": self.source_location,
            "extraction_confidence": self.extraction_confidence,
            "quality_tier": self.quality_tier.value,
        }
239
+
240
+
241
+ # ============== Legacy PolymerDataPoint (for compatibility) ==============
242
+
243
class PolymerDataPoint(BaseModel):
    """Single data point extracted from literature (legacy flat format).

    Newer code uses ContextualizedValue; this model is kept for backward
    compatibility with existing extraction output.
    """
    # Material Information
    polymer_name: str = Field(..., description="Polymer name, e.g. P3HT, PEDOT:PSS")
    polymer_class: Optional[str] = Field(None, description="Polymer class")
    dopant: Optional[str] = None
    dopant_ratio: Optional[str] = None

    # Processing Conditions
    solvent: Optional[str] = None
    concentration_mg_ml: Optional[float] = None
    spin_speed_rpm: Optional[int] = None
    spin_time_s: Optional[int] = None
    annealing_temp_c: Optional[float] = None
    annealing_time_min: Optional[float] = None
    annealing_atmosphere: Optional[str] = None
    film_thickness_nm: Optional[float] = None

    # Electrical Properties
    electrical_conductivity_s_cm: Optional[float] = None
    seebeck_coefficient_uv_k: Optional[float] = None
    power_factor_uw_m_k2: Optional[float] = None

    # Thermal Properties
    thermal_conductivity_w_mk: Optional[float] = None
    zt_figure_of_merit: Optional[float] = None

    # Structural
    xrd_crystallinity_percent: Optional[float] = None
    xrd_pi_stacking_angstrom: Optional[float] = None
    xrd_lamellar_spacing_angstrom: Optional[float] = None

    # Metadata
    source_paper_id: str
    source_table_or_figure: Optional[str] = None
    extraction_confidence: float = Field(default=0.5, ge=0.0, le=1.0)
    quality_tier: DataQuality = DataQuality.BRONZE
    raw_text_snippet: Optional[str] = None

    @field_validator('electrical_conductivity_s_cm', 'thermal_conductivity_w_mk', mode='before')
    @classmethod
    def validate_positive(cls, v: Any) -> Optional[float]:
        """Silently drop (set to None) physically impossible negative values."""
        if v is not None and isinstance(v, (int, float)) and v < 0:
            return None
        return v
288
+
289
+
290
+ # ============== Extraction Result ==============
291
+
292
class ExtractionResult(BaseModel):
    """
    Extraction result for a single paper.

    Supports both old format (paper=PaperMetadata) and new format
    (paper_id, paper_title); the model_validator backfills the new
    fields from the old one.
    """
    model_config = ConfigDict(extra="allow")

    # New format fields (preferred)
    paper_id: Optional[str] = None
    paper_title: Optional[str] = None

    # Old format field (for backward compatibility)
    paper: Optional[PaperMetadata] = None

    # Common fields
    data_points: List = Field(default_factory=list)  # Can be ContextualizedValue or PolymerDataPoint
    extraction_model: str = "unknown"
    extraction_timestamp: Any = Field(default_factory=lambda: datetime.now().isoformat())
    success: bool = True
    error_message: Optional[str] = None

    # Legacy fields
    llm_model_used: Optional[str] = None
    extraction_notes: Optional[str] = None

    @model_validator(mode='after')
    def extract_paper_fields(self):
        """Extract paper_id and paper_title from paper if not provided."""
        if self.paper is not None:
            if self.paper_id is None:
                self.paper_id = self.paper.id
            if self.paper_title is None:
                self.paper_title = self.paper.title
        # Copy llm_model_used to extraction_model if present
        if self.llm_model_used and self.extraction_model == "unknown":
            self.extraction_model = self.llm_model_used
        return self
literature/standardizer.py ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Unit standardization for production literature evidence.
3
+
4
+ The standard units are aligned with the platform property catalog so extracted
5
+ evidence can be compared and filtered consistently before human review.
6
+ """
7
+ import logging
8
+ import re
9
+ from dataclasses import dataclass
10
+ from typing import Callable, Dict, List, Optional
11
+
12
+ from .property_registry import PROPERTY_CATALOG
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
@dataclass
class StandardizationResult:
    """Outcome of a single value/unit standardization attempt."""
    success: bool                  # True when value and unit are populated
    value: Optional[float] = None  # converted numeric value
    unit: Optional[str] = None     # platform-standard unit string
    error: Optional[str] = None    # human-readable failure reason
24
+
25
+
26
def normalize_minus_signs(s: str) -> str:
    """Normalize all Unicode minus/dash look-alikes to ASCII hyphen-minus."""
    dash_like = "−–—‐‑‒⁻₋➖"
    return s.translate(str.maketrans(dict.fromkeys(dash_like, "-")))
34
+
35
+
36
def _identity(value: float) -> float:
    """Return *value* unchanged (no-op unit conversion)."""
    return value
38
+
39
+
40
+ def _mul(factor: float) -> Callable[[float], float]:
41
+ return lambda value: value * factor
42
+
43
+
44
+ def _add(delta: float) -> Callable[[float], float]:
45
+ return lambda value: value + delta
46
+
47
+
48
class UnitStandardizer:
    """Convert raw values from papers to platform-standard units.

    Standard units come from the shared PROPERTY_CATALOG so extracted
    evidence is comparable and filterable before human review.
    """

    # property key -> platform-standard unit string
    STANDARD_UNITS = {key: meta["unit"] for key, meta in PROPERTY_CATALOG.items()}

    # normalized lowercase spellings seen in papers -> canonical unit string
    UNIT_ALIASES = {
        # Temperature
        "k": "K",
        "kelvin": "K",
        "c": "C",
        "°c": "C",
        "deg c": "C",
        "celsius": "C",
        # Thermal
        "w/mk": "W/(m*K)",
        "w/(m·k)": "W/(m*K)",
        "w m-1 k-1": "W/(m*K)",
        "w·m⁻¹·k⁻¹": "W/(m*K)",
        "mw/(m*k)": "mW/(m*K)",
        "mw/(m·k)": "mW/(m*K)",
        "j/kgk": "J/(kg*K)",
        "j/(kg·k)": "J/(kg*K)",
        "j/(kg*k)": "J/(kg*K)",
        "j/gk": "J/(g*K)",
        "j/(g*k)": "J/(g*K)",
        # Mechanical
        "gpa": "GPa",
        "mpa": "MPa",
        # Transport / physical
        "pa s": "Pa*s",
        "pa·s": "Pa*s",
        "pas": "Pa*s",
        "mpa*s": "mPa*s",
        "cm2/s": "cm^2/s",
        "cm^2/s": "cm^2/s",
        "mm2/s": "mm^2/s",
        "mm^2/s": "mm^2/s",
        "g/cm3": "g/cm^3",
        "g/cm^3": "g/cm^3",
        "kg/m3": "kg/m^3",
        "kg/m^3": "kg/m^3",
        "ang": "Angstrom",
        "angstrom": "Angstrom",
        "å": "Angstrom",
        "nm": "nm",
        # Electronics
        "ev": "eV",
        "a.u.": "a.u.",
        "au": "a.u.",
        "debye": "Debye",
        # Gas / transport
        "barrer": "Barrer",
        # Extended literature properties
        "s/cm": "S/cm",
        "s m-1": "S/m",
        "s/m": "S/m",
        "uv/k": "uV/K",
        "μv/k": "uV/K",
        "µv/k": "uV/K",
        "mv/k": "mV/K",
        "uw/(m*k^2)": "uW/(m*K^2)",
        "uw/(m*k**2)": "uW/(m*K^2)",
        "uw/(m·k²)": "uW/(m*K^2)",
        "mw/(m*k^2)": "mW/(m*K^2)",
        "%": "%",
        "dimensionless": "",
        "-": "",
        "": "",
    }

    # property key -> {(reported_unit, standard_unit): transform}
    CONVERSIONS: Dict[str, Dict[tuple[str, str], Callable[[float], float]]] = {
        "tm": {("C", "K"): _add(273.15)},
        "tg": {("C", "K"): _add(273.15)},
        "cp": {("J/(g*K)", "J/(kg*K)"): _mul(1000.0)},
        "tc": {("mW/(m*K)", "W/(m*K)"): _mul(0.001)},
        "young": {("MPa", "GPa"): _mul(0.001)},
        "shear": {("MPa", "GPa"): _mul(0.001)},
        "bulk": {("MPa", "GPa"): _mul(0.001)},
        "visc": {("mPa*s", "Pa*s"): _mul(0.001)},
        "dif": {("mm^2/s", "cm^2/s"): _mul(0.01)},
        "rho": {("kg/m^3", "g/cm^3"): _mul(0.001)},
        "rg": {("nm", "Angstrom"): _mul(10.0)},
        "electrical_conductivity": {("S/m", "S/cm"): _mul(0.01)},
        "seebeck_coefficient": {("mV/K", "uV/K"): _mul(1000.0)},
        "power_factor": {("mW/(m*K^2)", "uW/(m*K^2)"): _mul(1000.0)},
    }

    def standardize(
        self,
        property_name: str,
        raw_value: str,
        raw_unit: str,
    ) -> StandardizationResult:
        """
        Convert (raw_value, raw_unit) to the standard unit for property_name.

        Returns:
            StandardizationResult with success=True and the converted value,
            or success=False and a human-readable error.
        """
        try:
            numeric = self._parse_numeric(raw_value)
        except ValueError as exc:
            return StandardizationResult(success=False, error=f"Parse error: {exc}")

        standard_unit = self.STANDARD_UNITS.get(property_name)
        if standard_unit is None:
            return StandardizationResult(success=False, error=f"Unknown property: {property_name}")

        normalized = self._normalize_unit(raw_unit)
        # Dimensionless properties accept any (or no) reported unit.
        if standard_unit in {"dimensionless", ""}:
            return StandardizationResult(success=True, value=numeric, unit="")

        if normalized == standard_unit:
            return StandardizationResult(success=True, value=numeric, unit=standard_unit)

        transform = self.CONVERSIONS.get(property_name, {}).get((normalized, standard_unit))
        if transform is not None:
            return StandardizationResult(success=True, value=transform(numeric), unit=standard_unit)

        if normalized == "":
            return StandardizationResult(success=False, error=f"Missing unit for {property_name}")

        return StandardizationResult(
            success=False,
            error=f"Cannot convert {normalized} to {standard_unit} for {property_name}",
        )

    def _parse_numeric(self, value_str: str) -> float:
        """
        Parse a raw value string into a float.

        Handles Unicode minus signs, superscript exponents ("3×10⁻²"),
        "×10^n" scientific notation, ranges ("1.2-3.4" -> midpoint), and
        "a ± b" uncertainty (returns a).

        Raises:
            ValueError: if the string cannot be interpreted as a number.
        """
        s = normalize_minus_signs(str(value_str or "").strip())

        # Map superscript digits to ASCII *before* rewriting "×10..." so
        # exponents typeset as superscripts (e.g. "3×10⁻²") become "3×10-2"
        # and match the scientific-notation regex below.  (Applying this
        # after the regex, as before, left "3×10-²" unconvertible.)
        superscripts = {
            "⁰": "0", "¹": "1", "²": "2", "³": "3", "⁴": "4",
            "⁵": "5", "⁶": "6", "⁷": "7", "⁸": "8", "⁹": "9", "⁻": "-",
        }
        for sup, norm in superscripts.items():
            s = s.replace(sup, norm)

        # "3 × 10^-2" / "3x10-2" -> "3e-2"
        s = re.sub(r"\s*[×x]\s*10\^?\s*(-?\d+)", r"e\1", s)
        s = s.replace(" ", "")

        # "1.2-3.4" -> midpoint of the reported range
        range_match = re.match(r"^(\d+(?:\.\d+)?)\s*-\s*(\d+(?:\.\d+)?)$", s)
        if range_match:
            low = float(range_match.group(1))
            high = float(range_match.group(2))
            return (low + high) / 2

        # "a ± b" -> central value a
        pm_match = re.match(r"^([\d.eE+-]+)\s*[±]\s*[\d.eE+-]+$", s)
        if pm_match:
            return float(pm_match.group(1))

        return float(s)

    def _normalize_unit(self, unit: str) -> str:
        """Canonicalize a raw unit string via the alias table.

        Unknown units are returned after minimal cleanup (minus signs,
        superscripts, middle dots) so error messages stay readable.
        """
        normalized = normalize_minus_signs(str(unit or "").strip())
        normalized = normalized.replace("²", "^2").replace("³", "^3")
        # "·" is the typeset multiplication dot; "\u00a0" is a non-breaking
        # space that sometimes survives PDF extraction.
        normalized = normalized.replace("·", "*").replace("\u00a0", " ")
        key = re.sub(r"\s+", " ", normalized.lower()).strip()
        return self.UNIT_ALIASES.get(key, normalized)

    def standardize_data_points(self, data_points: List) -> List:
        """Standardize each data point in place, recording errors on failure."""
        for dp in data_points:
            result = self.standardize(
                property_name=dp.property_name,
                raw_value=dp.raw_value,
                raw_unit=dp.raw_unit,
            )
            if result.success:
                dp.standardized_value = result.value
                dp.standardized_unit = result.unit
            else:
                dp.standardization_error = result.error
        return data_points
scripts/__pycache__/run_literature_mining.cpython-313.pyc ADDED
Binary file (7.79 kB). View file
 
scripts/evaluate_polyie.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import json
6
+ from pathlib import Path
7
+
8
+ from literature.evaluation import evaluate_predictions, load_json_records
9
+
10
+
11
def main() -> None:
    """CLI entry point: score predictions against gold and emit metrics as JSON."""
    arg_parser = argparse.ArgumentParser(description="Evaluate extraction output against a POLYIE-style gold file.")
    arg_parser.add_argument("--gold", required=True, help="Gold file (.json or .jsonl)")
    arg_parser.add_argument("--pred", required=True, help="Prediction file (.json or .jsonl)")
    arg_parser.add_argument("--out", default=None, help="Optional JSON output path")
    options = arg_parser.parse_args()

    # Compute metrics over the two record sets and render once.
    metrics = evaluate_predictions(
        load_json_records(options.gold),
        load_json_records(options.pred),
    )
    rendered = json.dumps(metrics, indent=2, ensure_ascii=False)
    print(rendered)

    # Optionally persist the same rendering (plus trailing newline) to disk.
    if options.out:
        Path(options.out).write_text(rendered + "\n", encoding="utf-8")


if __name__ == "__main__":
    main()
scripts/run_literature_mining.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Project-based literature mining CLI.
4
+
5
+ Examples:
6
+ python scripts/run_literature_mining.py --query "PEDOT:PSS thermoelectric" --limit 5
7
+ python scripts/run_literature_mining.py --project-id proj_xxx --query "P3HT conductivity" --save-mode files
8
+ """
9
+ from __future__ import annotations
10
+
11
+ import argparse
12
+ import csv
13
+ import json
14
+ from pathlib import Path
15
+ from typing import Any, Dict, List
16
+
17
+ from dotenv import load_dotenv
18
+
19
+ from src.literature_service import (
20
+ DataPointRepo,
21
+ LiteraturePipeline,
22
+ ProjectRepo,
23
+ QueryIntentService,
24
+ QuerySessionRepo,
25
+ get_database,
26
+ )
27
+
28
+ load_dotenv()
29
+
30
+
31
def resolve_project_id(project_id: str | None, projects: ProjectRepo) -> str:
    """Return a usable project ID.

    An explicit ``project_id`` is validated against the repo; otherwise the
    first existing project is reused, and as a last resort a default project
    is created.

    Raises:
        ValueError: if an explicit ``project_id`` does not exist.
    """
    if project_id:
        if not projects.get_project(project_id):
            raise ValueError(f"Project not found: {project_id}")
        return project_id

    available = projects.list_projects()
    if available:
        return available[0]["id"]

    default_project = projects.create_project(
        name="Default Literature Project",
        description="Auto-created by run_literature_mining.py",
    )
    return default_project["id"]
47
+
48
+
49
def export_points_to_files(project_id: str, points: List[Dict[str, Any]], out_dir: Path) -> None:
    """Export validated data points to JSONL and CSV files under ``out_dir``.

    Args:
        project_id: Owning project ID (kept for interface symmetry; not part
            of the written file contents).
        points: Row dicts to export; keys may vary between rows.
        out_dir: Target directory, created if missing.
    """
    out_dir.mkdir(parents=True, exist_ok=True)

    jsonl_path = out_dir / "validated_points.jsonl"
    with jsonl_path.open("w", encoding="utf-8") as f:
        for row in points:
            f.write(json.dumps(row, ensure_ascii=False) + "\n")

    csv_path = out_dir / "validated_points.csv"
    if points:
        # Collect the union of keys across ALL rows (first-seen order).
        # Using only points[0].keys() makes DictWriter raise ValueError on
        # any row that carries a key the first row lacks.
        fieldnames: Dict[str, None] = {}
        for row in points:
            for key in row:
                fieldnames.setdefault(key, None)
        with csv_path.open("w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=list(fieldnames), restval="")
            writer.writeheader()
            writer.writerows(points)
    else:
        # Still emit a header-only CSV so downstream readers find the file.
        csv_path.write_text("point_id,project_id\n", encoding="utf-8")

    print(f"Exported {len(points)} rows to:")
    print(f" - {jsonl_path}")
    print(f" - {csv_path}")
69
+
70
+
71
def main() -> None:
    """CLI entry point: run the project-based literature-mining pipeline.

    Flow: parse args -> open sqlite-backed repos -> resolve/create the target
    project -> record query intent -> run discovery/retrieval/extraction
    (either persisted via ``run_full_pipeline`` or ad-hoc with ``--no-save``)
    -> optionally export the project's validated points to files.
    """
    parser = argparse.ArgumentParser(description="Project-based Literature Mining CLI")
    parser.add_argument("--project-id", default=None, help="Target project ID")
    parser.add_argument("--query", default="PEDOT:PSS thermoelectric conductivity", help="Search query")
    parser.add_argument("--limit", type=int, default=5, help="Max papers per source")
    parser.add_argument("--strategy", choices=["simple", "paperqa"], default="simple", help="Extraction strategy")
    parser.add_argument("--model-provider", default="openai_compatible", help="Model provider name")
    parser.add_argument("--model-name", default="gpt-oss:latest", help="Model name")
    parser.add_argument("--save-mode", choices=["sqlite", "files"], default="sqlite", help="Result sink mode")
    parser.add_argument("--no-save", action="store_true", help="Do not persist result to sqlite")
    # Parsed but not consumed below — reserved for a future batch-upload flow.
    parser.add_argument("--manual-upload-dir", default="data/literature/manual_uploads", help="Reserved for batch manual upload")
    args = parser.parse_args()

    # All repos share one sqlite database; the pipeline opens the same path
    # independently via its own db_path argument.
    db = get_database("data/app.db")
    project_repo = ProjectRepo(db)
    point_repo = DataPointRepo(db)
    query_repo = QuerySessionRepo(db)
    query_intent = QueryIntentService(query_repo)
    pipeline = LiteraturePipeline(db_path="data/app.db")

    # Falls back to the first existing project, or creates a default one.
    target_project_id = resolve_project_id(args.project_id, project_repo)
    project = project_repo.get_project(target_project_id)
    print("=" * 64)
    print("Project-Based Literature Mining")
    print(f"Project: {project['name']} ({target_project_id})")
    print(f"Query: {args.query}")
    print(f"Limit per source: {args.limit}")
    print(f"Strategy: {args.strategy}")
    print("=" * 64)

    # Store the query session and surface any suggested refinements.
    query_session = query_intent.analyze_and_store(target_project_id, args.query)
    suggestions = json.loads(query_session.get("suggestions_json") or "[]")
    if suggestions:
        print("Query suggestions:")
        for s in suggestions:
            print(f" - {s}")
    # Clarification is advisory only in CLI mode — the run proceeds anyway.
    if query_session.get("clarification_required"):
        print("Note: query marked as pending_clarification. Continuing by CLI override.")

    if args.no_save:
        # Ad-hoc path: run the three stages directly without creating a DB
        # run record. NOTE(review): this branch does not forward
        # args.model_provider to run_extraction — confirm that is intended.
        discovered = pipeline.run_discovery(target_project_id, args.query, args.limit)
        retrieved = pipeline.run_retrieval(target_project_id, discovered)
        stats = pipeline.run_extraction(
            target_project_id,
            run_id=None,
            paper_rows=retrieved,
            strategy=args.strategy,
            model_name=args.model_name,
            use_full_text=True,
        )
        print(f"Extraction complete without DB run record: {stats}")
    else:
        # Persisted path: the pipeline records the run and returns a status
        # dict with "status", "error", and "stats" keys.
        result = pipeline.run_full_pipeline(
            project_id=target_project_id,
            query=args.query,
            limit=args.limit,
            strategy=args.strategy,
            model_provider=args.model_provider,
            model_name=args.model_name,
            use_full_text=True,
        )
        print(f"Pipeline status: {result.get('status')}")
        if result.get("status") != "completed":
            print(f"Error: {result.get('error')}")
        else:
            print(json.dumps(result.get("stats", {}), indent=2))

    # Export everything currently stored for the project (not just this
    # run's points) when file output was requested.
    points = point_repo.list_points(target_project_id)
    if args.save_mode == "files":
        run_dir = Path("data/literature/runs")
        export_points_to_files(target_project_id, points, run_dir)

    print("=" * 64)
    print("Done.")
    print("=" * 64)


if __name__ == "__main__":
    main()
scripts/train_prior_slurm.sh ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
#SBATCH --job-name=polymer_prior
#SBATCH --nodes=1
#SBATCH --gres=gpu:4
#SBATCH --cpus-per-task=16
#SBATCH --mem=64G
#SBATCH --time=24:00:00
#SBATCH --output=logs/train_prior_%j.out
#SBATCH --error=logs/train_prior_%j.err

# Distributed training of the RNN prior on a single 4-GPU SLURM node.
# Fail fast on errors, unset variables, and pipeline failures.
set -euo pipefail

# Adjust these for your CRC environment. Both can now be overridden from the
# submitting environment, e.g.:
#   REPO_DIR=/path/to/checkout sbatch scripts/train_prior_slurm.sh
# The defaults below point at a local development checkout.
REPO_DIR="${REPO_DIR:-/Users/xuguoyue/Documents/GitHub/POLYMER-PROPERTY}"
VENV_DIR="${VENV_DIR:-$REPO_DIR/.venv}"

cd "$REPO_DIR"

# Load modules if your CRC requires it (example)
# module load python/3.10

source "$VENV_DIR/bin/activate"

# Log directory must exist before SLURM/torchrun write into it.
mkdir -p logs

# Bound intra-op thread pools so 4 ranks do not oversubscribe the 16 CPUs.
export OMP_NUM_THREADS=8
export MKL_NUM_THREADS=8

# One training process per GPU on this node.
torchrun --nproc_per_node=4 RNN/train_prior.py \
  --smiles-csv data/PI1M.csv \
  --vocab RNN/pretrained_model/voc \
  --output RNN/pretrained_model/Prior.ckpt \
  --epochs 10 \
  --batch-size 256 \
  --lr 1e-3 \
  --max-length 140 \
  --num-workers 4 \
  --log-every 200
src/.DS_Store ADDED
Binary file (10.2 kB). View file
 
src/__pycache__/conv.cpython-310.pyc ADDED
Binary file (7.21 kB). View file
 
src/__pycache__/conv.cpython-313.pyc ADDED
Binary file (11.2 kB). View file
 
src/__pycache__/data_builder.cpython-310.pyc ADDED
Binary file (24 kB). View file
 
src/__pycache__/data_builder.cpython-313.pyc ADDED
Binary file (40.6 kB). View file
 
src/__pycache__/discover_llm.cpython-310.pyc ADDED
Binary file (23 kB). View file
 
src/__pycache__/discover_llm.cpython-313.pyc ADDED
Binary file (37.7 kB). View file
 
src/__pycache__/discovery.cpython-310.pyc ADDED
Binary file (21 kB). View file
 
src/__pycache__/discovery.cpython-313.pyc ADDED
Binary file (34.6 kB). View file