File size: 6,619 Bytes
7b161f7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
"""

LLM-based datasheet parser using HuggingFace Inference API (LLaMA 3.1).



Takes raw web content or uploaded text and extracts structured polymer

datasheet properties.

"""

from __future__ import annotations

import json
import logging
import re
from typing import Optional

from huggingface_hub import InferenceClient

import config
from models import DatasheetRecord

logger = logging.getLogger(__name__)

# ── System prompt for structured extraction ──────────────────────────────────

# System message sent with every extraction request. The JSON keys listed here
# are the ones the model is asked to fill; keys that are not fields of
# DatasheetRecord are discarded when the response is converted to a record.
# NOTE: unit examples use real "³", "°" and "—" characters (previously stored
# as mis-decoded UTF-8), so the file must be saved as UTF-8.
SYSTEM_PROMPT = """\

You are an expert polymer materials scientist and data extraction specialist.

Your task is to extract technical datasheet properties from the provided raw text

and return them as a JSON object.



RULES:

1. Extract ONLY information explicitly stated in the source text.

2. If a property is not found, leave the value as an empty string "".

3. Include units in the value where available (e.g., "65 MPa", "1.14 g/cm³").

4. For properties with ranges, format as "min - max unit" (e.g., "220 - 260 °C").

5. If multiple grades/variants exist, pick the one that best matches the query.

6. Return ONLY valid JSON — no markdown, no extra text, no code blocks.



Return a JSON object with exactly these keys:



{

  "material_name": "",

  "trade_name": "",

  "manufacturer": "",

  "polymer_family": "",

  "grade": "",

  "description": "",

  "processing_method": "",

  "features": "",

  "applications": "",

  "tensile_strength_mpa": "",

  "tensile_modulus_mpa": "",

  "elongation_at_break_pct": "",

  "flexural_strength_mpa": "",

  "flexural_modulus_mpa": "",

  "impact_strength_charpy_kj_m2": "",

  "impact_strength_izod_j_m": "",

  "hardness_shore_d": "",

  "hardness_rockwell": "",

  "compressive_strength_mpa": "",

  "melting_temperature_c": "",

  "glass_transition_temperature_c": "",

  "heat_deflection_temperature_c": "",

  "vicat_softening_temperature_c": "",

  "continuous_service_temperature_c": "",

  "thermal_conductivity_w_mk": "",

  "coefficient_of_thermal_expansion_um_mk": "",

  "flammability_rating": "",

  "density_g_cm3": "",

  "melt_flow_index_g_10min": "",

  "water_absorption_pct": "",

  "moisture_absorption_pct": "",

  "specific_gravity": "",

  "transparency": "",

  "color": "",

  "dielectric_strength_kv_mm": "",

  "dielectric_constant": "",

  "volume_resistivity_ohm_cm": "",

  "surface_resistivity_ohm": "",

  "dissipation_factor": "",

  "acid_resistance": "",

  "alkali_resistance": "",

  "solvent_resistance": "",

  "uv_resistance": "",

  "weatherability": "",

  "fda_approved": "",

  "rohs_compliant": "",

  "reach_compliant": "",

  "ul94_rating": ""

}

"""


def parse_datasheet(
    raw_content: str,
    manufacturer: str = "",
    polymer_family: str = "",
    grade: str = "",
    source_url: str = "",
) -> tuple[Optional[DatasheetRecord], list[str]]:
    """Extract a structured datasheet record from raw text via the LLM.

    The text is sent, together with ``SYSTEM_PROMPT``, to the chat model
    configured by ``config.HF_MODEL_ID`` through the HuggingFace
    ``InferenceClient``; the reply is then converted into a
    ``DatasheetRecord`` by ``_extract_json_to_record``.

    Args:
        raw_content: Raw web or uploaded text to extract properties from.
            ``None``, empty, or whitespace-only input is rejected without
            calling the model.
        manufacturer: Optional hint naming the manufacturer of interest.
        polymer_family: Optional hint naming the polymer family of interest.
        grade: Optional hint naming the grade of interest.
        source_url: Value stored in the record's ``source_url`` field.

    Returns:
        A ``(record, errors)`` tuple. ``record`` is ``None`` whenever any
        step fails; ``errors`` then holds human-readable messages describing
        what went wrong. This function does not raise for inference or
        parsing failures.
    """
    errors: list[str] = []

    # Guard against None as well as blank text so callers passing the result
    # of a failed fetch get an error entry instead of an AttributeError.
    if not raw_content or not raw_content.strip():
        errors.append("No raw content to parse.")
        return None, errors

    # Build the user prompt; the hint is only included when the caller gave
    # at least one of manufacturer / family / grade.
    context_hint = ""
    if manufacturer or polymer_family or grade:
        context_hint = (
            f"\nThe user is looking for: Manufacturer={manufacturer}, "
            f"Polymer Family={polymer_family}, Grade={grade}.\n"
            "Focus extraction on this specific material.\n"
        )

    user_prompt = (
        f"{context_hint}\n"
        f"Extract the polymer datasheet properties from the following raw text:\n\n"
        f"{raw_content}"
    )

    # Call HuggingFace Inference API. Any failure here (network, auth,
    # malformed response object) is reported through `errors`, not raised.
    try:
        client = InferenceClient(
            model=config.HF_MODEL_ID,
            token=config.HF_TOKEN,
        )

        response = client.chat_completion(
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_prompt},
            ],
            max_tokens=config.LLM_MAX_NEW_TOKENS,
            temperature=config.LLM_TEMPERATURE,
        )

        raw_response = response.choices[0].message.content
    except Exception as exc:
        errors.append(f"LLM inference failed: {exc}")
        logger.error("LLM inference failed: %s", exc)
        return None, errors

    # The message content may be None or blank; report that explicitly rather
    # than as an inference failure or a confusing JSON parse error.
    if not raw_response or not raw_response.strip():
        errors.append("LLM returned an empty response.")
        logger.error("LLM returned an empty response.")
        return None, errors

    logger.info("LLM response length: %d chars", len(raw_response))

    # Parse JSON from response
    record = _extract_json_to_record(raw_response, source_url, errors)
    return record, errors


def _stringify_value(value: object) -> str:
    """Convert one decoded JSON value into the string stored on the record."""
    if value is None:
        # JSON null means "not found"; the prompt's convention for that is "".
        return ""
    if isinstance(value, str):
        return value
    if isinstance(value, (list, tuple)):
        # Join list items instead of storing a Python repr like "['a', 'b']".
        return ", ".join(_stringify_value(item) for item in value)
    return str(value)


def _extract_json_to_record(
    raw_response: str,
    source_url: str,
    errors: list[str],
) -> Optional[DatasheetRecord]:
    """Convert the LLM's reply into a ``DatasheetRecord``.

    The reply may be bare JSON, JSON wrapped in a markdown code fence, or
    JSON surrounded by extra prose. The first JSON object found is used.

    Args:
        raw_response: Text returned by the model.
        source_url: Value written to the record's ``source_url`` field,
            overriding anything the model returned under that key.
        errors: List that failure messages are appended to in place.

    Returns:
        The populated record, or ``None`` if the reply contains no valid
        JSON object or the record cannot be constructed (a message is
        appended to ``errors`` in both cases).
    """
    json_str = (raw_response or "").strip()

    # Remove markdown code block wrappers if present
    code_block_match = re.search(
        r"```(?:json)?\s*\n?(.*?)\n?```", json_str, re.DOTALL
    )
    if code_block_match:
        json_str = code_block_match.group(1).strip()

    try:
        object_start = json_str.find("{")
        if object_start == -1:
            # No object present: decode as-is so the caller gets either a
            # JSONDecodeError or the "not a JSON object" error below.
            data = json.loads(json_str)
        else:
            # Decode exactly one object starting at the first "{" and ignore
            # whatever follows it. Unlike a greedy "{...}" match this still
            # works when trailing commentary happens to contain braces.
            data, _ = json.JSONDecoder().raw_decode(json_str, object_start)
    except json.JSONDecodeError as exc:
        errors.append(f"Failed to parse JSON from LLM response: {exc}")
        logger.error("JSON parse error: %s\nRaw response:\n%s", exc, raw_response[:500])
        return None

    if not isinstance(data, dict):
        errors.append("LLM response is not a JSON object.")
        return None

    # Set source URL
    data["source_url"] = source_url

    # Build DatasheetRecord, ignoring unknown fields
    valid_fields = set(DatasheetRecord.model_fields.keys())
    filtered = {
        key: _stringify_value(value)
        for key, value in data.items()
        if key in valid_fields
    }

    try:
        return DatasheetRecord(**filtered)
    except Exception as exc:
        errors.append(f"Failed to create DatasheetRecord: {exc}")
        return None


def parse_uploaded_text(
    text: str,
    source_label: str = "user_upload",
) -> tuple[Optional[DatasheetRecord], list[str]]:
    """Run datasheet extraction on text supplied directly by the user.

    Thin convenience wrapper around ``parse_datasheet``: the text (e.g. the
    output of a PDF extraction step) is parsed without any manufacturer,
    family, or grade hints, and ``source_label`` is recorded in place of a
    source URL.

    Returns the same ``(record, errors)`` tuple as ``parse_datasheet``.
    """
    outcome = parse_datasheet(raw_content=text, source_url=source_label)
    return outcome