# omnigreen-api / extractor.py
# Repository page residue (not part of the module): "DrMO2487's picture",
# "Upload 18 files", commit 5bf896a (verified).
import json
import PyPDF2
from typing import Dict, Any
# ─── CRITICAL UNIT RULES (used in prompt) ────────────────────────────────────
# All numeric amounts MUST be reported in grams (g) or milliliters (mL).
# ─── SHARED ARTICLE SUMMARY SCHEMA ───────────────────────────────────────────
# JSON fragment describing the "article_summary" object. The prompt templates
# below splice it in directly after the opening brace of their
# "Return ONLY a valid JSON object" example, which is why it ends with a
# trailing comma. Each value is an instruction to the model, not real data.
_SUMMARY_SCHEMA = '''"article_summary": {
"analytes": ["list ALL analytes/compounds being determined. E.g. Metformin, Sitagliptin"],
"matrices": ["list ALL sample matrices. E.g. Tablet, Plasma, Urine, Serum, Saliva, Water, Food, Soil"],
"techniques": ["list ALL techniques MENTIONED anywhere in the paper (including literature review)"],
"developed_techniques": ["list ONLY the technique(s) that were DEVELOPED, VALIDATED, or EXPERIMENTALLY APPLIED in THIS paper. These must have an Experimental section, instrumentation setup, validation data (LOD/LOQ/precision/recovery) or sample preparation described in detail. Do NOT include techniques only cited or compared from literature."],
"quantification_method": "External Standard | Standard Addition | Internal Standard | Calibration Curve",
"validation_status": "Fully Validated | Partially Validated | Not Validated",
"key_findings": "1-2 sentence summary of the paper purpose and conclusion"
},'''
_SUMMARY_INSTRUCTIONS = """
══════════════════════════════════════════════
ARTICLE SUMMARY EXTRACTION RULES:
- analytes: EVERY compound measured. Check title, abstract, intro, conclusions.
- matrices: ALL biological/pharmaceutical/environmental matrices.
- techniques (all-mentioned): ALL instruments/techniques cited anywhere β€” Intro, Methods, Results, Discussion.
- developed_techniques (KEY FIELD β€” read carefully):
βœ“ INCLUDE a technique if the paper contains:
β€’ A dedicated experimental/methods subsection describing the instrument setup for it
β€’ Validation data specific to it (linearity, LOD, LOQ, precision, accuracy, recovery, robustness)
β€’ Sample-preparation steps designed for it
β€’ Chromatographic/spectral conditions (column, mobile phase, wavelength, gradient, etc.)
βœ— EXCLUDE a technique if it is only:
β€’ Mentioned in Introduction or Literature Review as a comparison or background reference
β€’ Cited as β€œpreviously reported by [author]” or β€œother studies used X”
β€’ Referenced for comparison in a table without being experimentally performed in this paper
- validation_status: Look for β€œICH”, β€œUSP”, β€œlinearity”, β€œLOD”, β€œLOQ”, β€œprecision”, β€œaccuracy”, β€œrecovery”.
══════════════════════════════════════════════
"""
# ─── LIGHTWEIGHT TECHNIQUE SCANNER (Phase 1 only) ─────────────────────────────
# Phase-1 prompt: asks only for the article summary, with extra emphasis on
# separating techniques developed in the paper from those merely cited.
# Ends with the "Document Text:" delimiter so the article text can follow it.
# Mis-decoded punctuation (dash, check mark, cross) is restored to real
# Unicode so the rules read cleanly to the model.
SCAN_TECHNIQUES_PROMPT = """You are an expert analytical chemistry assistant reviewing a scientific article.
Your SOLE task is to identify which analytical technique(s) were actually DEVELOPED, OPTIMISED
or VALIDATED in this paper — as distinct from those merely cited in the literature review.
""" + _SUMMARY_INSTRUCTIONS + """
CRITICAL CLASSIFICATION RULES FOR 'developed_techniques':
================================================
A technique belongs in 'developed_techniques' ONLY if the paper contains AT LEAST ONE of:
✓ A dedicated Experimental/Materials & Methods subsection describing instrument setup for it
✓ Specific chromatographic/spectral conditions (column, mobile phase, wavelength, temperature,
gradient, flow rate, detector settings, etc.) for it
✓ Validation results generated in this study (linearity, LOD, LOQ, precision, accuracy,
recovery, robustness, stability)
✓ Sample preparation steps specifically designed around it
✓ New application of the technique to a matrix or compound presented as the paper's
original contribution
A technique must NOT be in 'developed_techniques' if it only appears as:
✗ A reference cited in the Introduction or Literature Review ("Smith et al. used HPLC-UV...")
✗ A technique listed in a comparison table copied from literature without being run here
✗ General background or motivation text
✗ A technique applied by a previous author that this paper is replacing or improving upon
================================================
IMPORTANT: A single paper may develop MORE THAN ONE technique (e.g. both HPLC-UV and
spectrophotometry). List ALL techniques that qualify by the rules above.
Return ONLY a valid JSON object:
{
"article_summary": {
"analytes": ["ALL analytes measured in this study"],
"matrices": ["ALL sample matrices used in this study"],
"techniques": ["ALL techniques MENTIONED anywhere in the paper (intro, methods, results)"],
"developed_techniques": [
"ONLY techniques with experimental setup/validation in THIS paper — apply the rules above.",
"Be SPECIFIC: HPLC-UV not just HPLC. GC-FID not just GC."
],
"quantification_method": "External Standard | Standard Addition | Internal Standard | Calibration Curve",
"validation_status": "Fully Validated | Partially Validated | Not Validated",
"key_findings": "1-2 sentence summary of the paper purpose and conclusion"
}
}
Document Text:
================
"""
# ─── EVIDENCE EXTENSION appended to every prompt before Document Text ─────────
_EVIDENCE_EXTENSION = """
══════════════════════════════════════════════
EVIDENCE REQUIREMENT β€” YOU MUST ALSO RETURN "_evidence":
For EVERY parameter key you return in the JSON above, also include a matching entry
in an "_evidence" object with TWO fields:
"quote": copy the EXACT sentence(s) from the document text that led to your answer
(max 2 sentences, verbatim from the text)
"reasoning": 1-2 sentences explaining HOW you interpreted the quote to arrive at the value
IMPORTANT: YOU MUST USE THE EXACT PARAMETER KEYS FROM YOUR SPECIFIC EXTRACTION TASK!
Do NOT use dummy keys. For example, if you extracted a parameter named "p1_type_of_analysis",
your _evidence block MUST contain "p1_type_of_analysis".
Bad example (too vague):
"_evidence": { "some_parameter_key": { "quote": "HPLC was used", "reasoning": "It's HPLC" } }
Good example (specific, verbatim, explains the logic, using the exact key you extracted):
"_evidence": {
"YOUR_EXACT_PARAMETER_KEY_HERE": {
"quote": "Chromatographic separation was performed on a Waters Acquity UPLC H-Class system with a BEH C18 column at 1.0 mL/min.",
"reasoning": "The system described is a UPLC instrument. The flow rate and BEH column are characteristic of UPLC, not standard HPLC."
}
}
Include _evidence for ALL parameter keys (article_summary fields + every scoring parameter).
For article_summary sub-fields (analytes, matrices, techniques), use the quote from the abstract or title.
For boolean fields, the quote should show the text that CONFIRMED or DENIED the feature.
══════════════════════════════════════════════
"""
# Prompt for the 12 "p1".."p12" green-chemistry parameters plus the shared
# article summary and evidence block. Ends with the "Document Text:" delimiter
# so the article text can follow it.
# The micro sign and multiplication sign were stored as mis-decoded UTF-8
# inside the unit-conversion rules; they are restored so the rules say
# "µg"/"µL" and "×" as intended.
EXTRACTION_PROMPT = """You are an expert analytical chemistry assistant. Extract green chemistry parameters AND article metadata from the following analytical chemistry paper.
══════════════════════════════════════════════
UNIT CONVERSION RULES — APPLY BEFORE WRITING JSON:
µg→g: /1,000,000 | mg→g: /1,000 | kg→g: ×1,000
µL→mL: /1,000 | nL→mL: /1,000,000 | L→mL: ×1,000
Waste (p7): SUM ALL solvents+reagents+mobile phase per run.
Toxicity (p11): SUM only hazardous solvents (MeCN, MeOH, DCM, CHCl3, hexane, toluene, ethyl acetate).
══════════════════════════════════════════════
SMART EXTRACTION GUIDANCE:
- p1 (Sample treatment): Look for "in-line","on-line","at-line","off-line","remote sensing","non-invasive","direct analysis".
- p2 (Sample amount): "sample weight","sample volume","aliquot","injection volume" in mg/µg/µL/g/mL.
- p3 (Device positioning): Classify device relationship to process: in-line/at-line/off-line.
- p4 (Steps): Count each: weighing, dissolving, filtration, centrifugation, SPE, LLE, dilution, derivatization, evaporation, reconstitution, sonication.
- p5 (Automation): "automated","autosampler","FIA","robotic","manual","semi-automated". Miniaturized: "micro","nano","chip","capillary","µL volumes".
- p6 (Derivatization): "derivatization","pre-column reaction","post-column reaction","fluorescent tag","ion pair","labeling agent".
- p7 (Waste): For LC/HPLC: flow_rate_mL_per_min × run_time_min. For batch: sum all solvent volumes.
- p8 (Throughput): Count target analytes. Runs/hour = 60/analysis_time_minutes.
- p9 (Technique): Identify primary from: HPLC, UPLC, GC, GC-MS, LC-MS, LC-MS/MS, HPTLC, TLC, UV-Vis, Fluorimetry, Potentiometry, Electrochemistry, AAS, ICP-MS, NMR, FTIR, NIR.
- p10 (Bio-based): "green solvents","bio-based","ethanol","water","CO2","deep eutectic solvents","ionic liquids".
- p11 (Toxic): acetonitrile, methanol, chloroform, dichloromethane, hexane, toluene, ethyl acetate, acetone.
- p12 (GHS threats 0-7): harmful to aquatic=1, bioaccumulative=1, persistent=1, flammable=1, oxidizing=1, explosive=1, corrosive=1.
""" + _SUMMARY_INSTRUCTIONS + """
Return ONLY a valid JSON object:
{
""" + _SUMMARY_SCHEMA + """
"p1_option": "one of: Remote sensing without sample damage | Remote sensing with little physical damage | Non-invasive analysis | In-field sampling and direct analysis | In-field sampling and on-line analysis | On-line analysis | At-line analysis | Off-line analysis | External sample pre- and treatment and batch analysis (reduced number of steps) | External sample pre- and treatment and batch analysis (large number of steps)",
"p2_amount": <float in g or mL>,
"p3_option": "one of: in-line | on-line | at-line | off-line",
"p4_steps": <integer>,
"p5_automation": "one of: automatic | semi-automatic | manual",
"p5_miniaturized": <true/false>,
"p6_has_derivatization": <true/false>,
"p6_agents_cas": ["list of CAS strings"],
"p7_waste_amount": <float mL>,
"p8_analytes_per_run": <integer>,
"p8_runs_per_hour": <float>,
"p9_technique": "one of: None | FTIR | Hot plate solvent evaporation (<10 min) | Rotary evaporation | Needle evaporation | Ultrasound-assisted extraction | SPE and SPME | Microbiological assays | Immunoassay | Spectrofluorometry | Titration | UPLC | UV-Vis Spectrometry | Energy dispersive X-ray fluorescence | Potentiometry | Non-instrumental detection | Hot plate solvent evaporation (10-150 min) | Accelerated solvent extraction | Supercritical fluid extraction | Microwave assisted extraction | Flame atomic absorption spectrometry | Electrothermal atomic absorption spectrometry | GC | ICP-MS | ICP-OES | LC | Hot plate solvent evaporation (>150 min) | Soxhlet extraction | NMR | GC-MS | LC-MS | X-ray diffractometry",
"p10_biobased_status": "one of: No reagents | All reagents are bio-based | Some reagents are bio-based | None of the reagents are from bio-based sources",
"p11_has_toxic_reagents": <true/false>,
"p11_toxic_amount": <float mL>,
"p12_threats_count": <integer 0-7>,
"_evidence": { "p1_option": {"quote": "...", "reasoning": "..."}, ... }
}
""" + _EVIDENCE_EXTENSION + """
Document Text:
================
"""
# Prompt for the NQS (Need, Quality, Sustainability) scores: a need tier,
# twelve 0-100 scores (r1-r4, g1-g4, b1-b4) and seventeen SDG booleans, plus
# the shared article summary and evidence block.
# The dash separating each field name from its description was stored as
# mis-decoded UTF-8 and is restored here.
NQS_PROMPT = """You are an expert analytical chemistry assistant. Extract NQS (Need, Quality, and Sustainability) parameters AND article metadata from the following analytical chemistry paper.
""" + _SUMMARY_INSTRUCTIONS + """
Return ONLY a valid JSON object:
{
""" + _SUMMARY_SCHEMA + """
"need_tier": <integer 1-4: 4=simple/wasteless, 3=automated/high-throughput, 2=real-time/trained personnel, 1=high-performance/large-consumption>,
"r1": <float 0-100: Scope — analytes count, matrices covered>,
"r2": <float 0-100: LOD/LOQ sensitivity — ng/mL level = high score>,
"r3": <float 0-100: Precision — RSD<2%=90-100, 2-5%=70-90, >5%=<70>,
"r4": <float 0-100: Accuracy/Recovery — 98-102%=95-100, 95-105%=80-95, other=<70>,
"g1": <float 0-100: Toxicity safety — 100=no hazardous reagents>,
"g2": <float 0-100: Waste amount — 100=minimal waste>,
"g3": <float 0-100: Energy — 100=ambient temp no heating>,
"g4": <float 0-100: Direct operator safety — 100=closed system no vapour>,
"b1": <float 0-100: Cost — 100=cheapest reagents/equipment>,
"b2": <float 0-100: Time — 100=fastest analysis>,
"b3": <float 0-100: Infrastructure — 100=bench-top no specialist facility>,
"b4": <float 0-100: Simplicity — 100=fewest steps minimal training>,
"sdg_1": <true/false: No Poverty — affordable diagnostics>,
"sdg_2": <true/false: Zero Hunger — food safety>,
"sdg_3": <true/false: Good Health — pharmaceutical/clinical/biomedical>,
"sdg_4": <true/false: Quality Education>,
"sdg_5": <true/false: Gender Equality>,
"sdg_6": <true/false: Clean Water — water quality/environmental>,
"sdg_7": <true/false: Clean Energy>,
"sdg_8": <true/false: Economic Growth>,
"sdg_9": <true/false: Innovation — novel methodology>,
"sdg_10": <true/false: Reduced Inequality>,
"sdg_11": <true/false: Sustainable Communities — environmental monitoring>,
"sdg_12": <true/false: Responsible Consumption — green chemistry>,
"sdg_13": <true/false: Climate Action — environmental impact>,
"sdg_14": <true/false: Life Below Water — aquatic contamination>,
"sdg_15": <true/false: Life on Land — soil/plant contamination>,
"sdg_16": <true/false: Peace and Justice — forensic/regulatory>,
"sdg_17": <true/false: Partnerships — multi-institution study>,
"_evidence": { "need_tier": {"quote": "...", "reasoning": "..."}, ... }
}
""" + _EVIDENCE_EXTENSION + """
Document Text:
================
"""
# Prompt for the ComplexMoGAPI colour codes (0=Green, 1=Yellow, 2=Red,
# 3=Not Applicable where allowed) covering the pre-analysis and analytical
# stages, plus the shared article summary and evidence block.
# Several threshold symbols were stored as mis-decoded UTF-8 and are restored
# here: the degree sign, "≤", "×" and the micro sign.
COMPLEXMOGAPI_PROMPT = """You are an expert analytical chemistry assistant. Extract ComplexMoGAPI parameters AND article metadata from the following paper.
""" + _SUMMARY_INSTRUCTIONS + """
SMART EXTRACTION FOR ComplexMoGAPI:
PRE-ANALYSIS STAGE:
- pre_yield: yield/recovery % in pre-analytical step. >89%=0, 70-89%=1, <70%=2. (Use 3 if Not Applicable)
- pre_temp_time: room temp <1h=0, room temp >1h OR heating <1h=1, heating >1h OR <0°C=2. (Use 3 if Not Applicable)
- pre_green_economy: count green chemistry rules (atom economy, catalysis, renewable feedstocks). 5-6=0, 3-4=1, 1-2=2.
- pre_health_hazard / pre_safety_hazard: NFPA/GHS of pre-analytical reagents. 0-1=0, 2-3=1, 4=2.
- pre_instrument: common bench=0, semi-advanced=1, autoclave/glove box/high pressure=2.
- pre_energy: ≤0.1kWh=0, ≤1.5kWh=1, >1.5kWh=2. Estimate from heating time × temperature × equipment.
- pre_occupational: hermetized=0, partial=1, vapours to atmosphere=2.
- pre_workup: none/simple=0, standard (filtration/washing)=1, advanced (chromatography/recrystallization)=2. (Use 3 if Not Applicable)
- pre_purity: >98%=0, 97-98%=1, <97%=2. (Use 3 if Not Applicable)
ANALYTICAL STAGE:
- an_collection: in-line (no sampling)=0, on/at-line=1, off-line (lab batch)=2.
- an_preservation: none=0, chemical/physical (freezing/acidification)=1, physicochemical=2.
- an_transport: none=0, required=1, not applicable=2.
- an_storage: room temp=0, refrigeration=1, special (controlled atmosphere/dark/frozen)=2.
- an_method_type: direct analysis no prep=0, simple (dissolution/filtration)=1, extraction required=2.
- an_extraction_scale: nano (<1µL)=0, micro (1-1000µL)=1, macro (>1mL)=2. (Use 3 if Not Applicable)
- an_solvents_type: solvent-free=0, green (water/ethanol)=1, non-green (MeCN/MeOH/DCM)=2.
- an_additional_treatments: none=0, simple (clean-up/evaporation)=1, advanced (derivatization/mineralization)=2.
- an_amount: <10mL=0, 10-100mL=1, >100mL=2.
- an_health_hazard / an_safety_hazard: NFPA/GHS analytical reagents.
- an_energy: UV ~0.05kWh=0, HPLC ~0.5kWh=1, GC-MS/ICP ~1kWh=2.
- an_occupational: sealed automated=0, partial=1, open solvent handling=2.
- an_waste: <1mL=0, 1-10mL=1, >10mL=2.
- an_waste_treatment: recycling/distillation=0, degradation/passivation=1, drain/no treatment=2.
0=Green, 1=Yellow, 2=Red, 3=Not Applicable (only for allowed parameters).
Return ONLY a valid JSON object:
{
""" + _SUMMARY_SCHEMA + """
"pre_yield": <0|1|2|3>,
"pre_temp_time": <0|1|2|3>,
"pre_green_economy": <0|1|2>,
"pre_health_hazard": <0|1|2>,
"pre_safety_hazard": <0|1|2>,
"pre_instrument": <0|1|2>,
"pre_energy": <0|1|2>,
"pre_occupational": <0|1|2>,
"pre_workup": <0|1|2|3>,
"pre_purity": <0|1|2|3>,
"an_collection": <0|1|2>,
"an_preservation": <0|1|2>,
"an_transport": <0|1|2>,
"an_storage": <0|1|2>,
"an_method_type": <0|1|2>,
"an_extraction_scale": <0|1|2|3>,
"an_solvents_type": <0|1|2>,
"an_additional_treatments": <0|1|2>,
"an_amount": <0|1|2>,
"an_health_hazard": <0|1|2>,
"an_safety_hazard": <0|1|2>,
"an_energy": <0|1|2>,
"an_occupational": <0|1|2>,
"an_waste": <0|1|2>,
"an_waste_treatment": <0|1|2>,
"_evidence": { "pre_yield": {"quote": "...", "reasoning": "..."}, ... }
}
""" + _EVIDENCE_EXTENSION + """
Document Text:
================
"""
# Prompt for the Analytical Eco-Scale inputs: a list of reagents (amount class
# and GHS pictogram counts), a list of instruments (energy class), the
# occupational-hazard mode and the waste volume/treatment, plus the shared
# article summary and evidence block. Ends with the "Document Text:" delimiter
# (presumably the caller appends the article text; confirm at the call site).
ECOSCALE_PROMPT = """You are an expert analytical chemistry assistant. Extract Analytical Eco-Scale parameters AND article metadata from the following paper.
""" + _SUMMARY_INSTRUCTIONS + """
SMART EXTRACTION FOR Analytical Eco-Scale:
Identify ALL reagents and instruments used in the analytical PROCEDURE (sample preparation + analysis).
REAGENTS:
For each distinct reagent/chemical used, estimate its amount class ('<10', '10-100', '>100' in mL or g).
Also estimate the number of GHS hazard pictograms it carries:
- warning_pictograms: count of 'Warning' level hazards.
- danger_pictograms: count of 'Danger' level (severe) hazards.
INSTRUMENTS:
For each main instrument used, estimate its energy class ('<0.1', '<1.5', '>1.5' in kWh per sample).
Example: <0.1 (titration, UV-Vis, FTIR), <1.5 (HPLC, GC, AAS), >1.5 (GC-MS, LC-MS, NMR, XRD).
OCCUPATIONAL HAZARD:
"hermetized" (closed system) OR "emission" (vapors/gases emitted to air).
WASTE:
- waste_volume: "none", "<1", "1-10", or ">10" (in mL/g).
- waste_treatment: "recycling", "degradation", "passivation", or "none".
Return ONLY a valid JSON object:
{
""" + _SUMMARY_SCHEMA + """
"reagents": [
{
"name": "<Reagent Name>",
"amount": "<10 | 10-100 | >100",
"warning_pictograms": <integer>,
"danger_pictograms": <integer>
}
],
"instruments": [
{
"name": "<Instrument Name>",
"energy": "<0.1 | <1.5 | >1.5"
}
],
"occupational_hazard": "<hermetized | emission>",
"waste_volume": "<none | <1 | 1-10 | >10>",
"waste_treatment": "<recycling | degradation | passivation | none>",
"_evidence": { "occupational_hazard": {"quote": "...", "reasoning": "..."}, ... }
}
""" + _EVIDENCE_EXTENSION + """
Document Text:
================
"""
# Prompt for the Carbon Footprint Reduction Index (CaFRI) inputs. Every key is
# a categorical string; the allowed values are listed twice, once as guidance
# and once inside the JSON template, so the two lists must be kept in sync.
# Includes the shared article summary and evidence block and ends with the
# "Document Text:" delimiter.
CAFRI_PROMPT = """You are an expert analytical chemistry assistant. Extract Carbon Footprint Reduction Index (CaFRI) parameters AND article metadata from the following paper.
""" + _SUMMARY_INSTRUCTIONS + """
SMART EXTRACTION FOR CaFRI:
Extract the variables specifically relevant to the Carbon Footprint of the procedure.
If not directly stated, estimate based on the methodological details (e.g., if HPLC is used, estimate power, personnel, throughput normally associated).
JSON KEYS AND VALID VALUES:
- "energy_reduction_program": "Yes" | "No"
- "instrument_power": "< 0.1 kW" | "0.1-1.5 kW" | "> 1.5 kW"
- "energy_intensive_equipment": "Yes" | "No" (e.g., fume hood required?)
- "sample_throughput": "< 10" | "10-30" | "> 30" (samples/hour)
- "carbon_footprint_known": "Yes" | "No"
- "emission_factor": "< 0.1" | "0.1-0.3" | "> 0.3" (kg CO2/kWh)
- "storage": "No storage" | "Normal conditions" | "Special conditions"
- "transported_to_lab": "Yes" | "No"
- "distance": "< 1 mile" | "1-10 miles" | "> 10 miles"
- "samples_per_shipment": "1" | "2-10" | "11-100" | "> 100"
- "ecofriendly_vehicle": "Yes" | "No"
- "personnel_per_sample": "1" | "2-3" | "4-5" | "> 5"
- "automation": "Automatic" | "Semiautomatic" | "Manual"
- "waste_amount": "< 10 mL/g" | "10-100 mL/g" | "> 100 mL/g"
- "waste_disposal": "Specialized personnel" | "Analyst" | "No disposal"
- "recycling": "Same method" | "Other methods" | "No recycling"
- "pictograms": "<= 3" | "4-6" | "7-9" | "> 9" (Total GHS hazard pictograms for all reagents combined)
- "organic_solvents": "< 5 mL" | "5-10 mL" | "> 10 mL" (per sample)
- "reagents_amount": "< 1 g/mL" | "1-3 g/mL" | "> 3 g/mL" (per sample)
Return ONLY a valid JSON object matching these exactly:
{
""" + _SUMMARY_SCHEMA + """
"energy_reduction_program": "<Yes | No>",
"instrument_power": "<< 0.1 kW | 0.1-1.5 kW | > 1.5 kW>",
"energy_intensive_equipment": "<Yes | No>",
"sample_throughput": "<< 10 | 10-30 | > 30>",
"carbon_footprint_known": "<Yes | No>",
"emission_factor": "<< 0.1 | 0.1-0.3 | > 0.3>",
"storage": "<No storage | Normal conditions | Special conditions>",
"transported_to_lab": "<Yes | No>",
"distance": "<< 1 mile | 1-10 miles | > 10 miles>",
"samples_per_shipment": "<1 | 2-10 | 11-100 | > 100>",
"ecofriendly_vehicle": "<Yes | No>",
"personnel_per_sample": "<1 | 2-3 | 4-5 | > 5>",
"automation": "<Automatic | Semiautomatic | Manual>",
"waste_amount": "<< 10 mL/g | 10-100 mL/g | > 100 mL/g>",
"waste_disposal": "<Specialized personnel | Analyst | No disposal>",
"recycling": "<Same method | Other methods | No recycling>",
"pictograms": "<<= 3 | 4-6 | 7-9 | > 9>",
"organic_solvents": "<< 5 mL | 5-10 mL | > 10 mL>",
"reagents_amount": "<< 1 g/mL | 1-3 g/mL | > 3 g/mL>",
"_evidence": { "energy_reduction_program": {"quote": "...", "reasoning": "..."}, ... }
}
""" + _EVIDENCE_EXTENSION + """
Document Text:
================
"""
# ─── BAGI PROMPT ─────────────────────────────────────────────────────────────
# Prompt for the ten Blue Applicability Grade Index (BAGI) parameters. Each
# answer must be one of the option strings listed under the parameter, copied
# verbatim. Unlike most other prompts this one carries its own "_evidence"
# example instead of the shared evidence extension, and it has no newline
# after the final "================" delimiter.
#
# Fix: the closing-brace instruction used "\}", an invalid escape sequence in
# a normal string literal that also sent a stray backslash to the model. It is
# now a plain "}", matching the same sentence in RAPI_PROMPT.
#
# NOTE(review): the p10 options contain a mis-encoded micro sign. They are left
# byte-identical here because the model is told to return the option text
# exactly, so code elsewhere may compare against these strings. Confirm against
# the BAGI scoring code before normalising them.
BAGI_PROMPT = """You are an expert analytical chemistry assistant. Extract parameters to calculate the Blue Applicability Grade Index (BAGI) of the described analytical method.
""" + _SUMMARY_INSTRUCTIONS + """
══════════════════════════════════════════════
EXTRACT THESE 10 BAGI PARAMETERS BASED ON THE FOLLOWING RULES:
1. p1_type_of_analysis:
- "Quantitative and Confirmatory"
- "Quantitative"
- "Screening"
- "Qualitative"
2. p2_number_of_analytes:
- "Multi-element analysis for > 15 compounds"
- "Multi-element analysis for 6-15 compounds of the same chemical group or 2-15 compounds of different chemical classes"
- "Multi-element analysis for 2-5 compounds of the same chemical class"
- "Single Element"
3. p3_instrumentation:
- "Simple in operation portable instrumentation (smart-phone based detectors, portable GC, etc.)"
- "Simple instrumentation available in most labs (UV, HPLC-UV, HPLC-DAD, UHPLC, FAAS, ETAAS, ICP-OES, GC-FID etc.)"
- "Sophisticated instrumentation (LC-MS, GC-MS, ICP-MS, homemade interfaces, homemade automatic systems, etc.)"
- "Instrumentation that is not commonly available in most labs (SFC, 2D-GC, 2D-LC, LC-MS/MS, GC-MS/MS, etc.)"
4. p4_sample_prep_capacity:
- ">=96"
- "13-95"
- "2-12"
- "1"
5. p5_sample_prep_scale:
- "Not required or on-site sample preparation if required"
- "Simple, low-cost sample preparation required (e.g. protein precipitation)"
- "Miniaturized extraction sample preparation (SPME, DLLME, MEPS, SBSE, d-SPE, FPSE, etc.)"
- "Multi-step sample preparation required (e.g. LLE, SPE and/or derivatization)"
6. p6_sample_throughput:
- ">10"
- "5-10"
- "2-4"
- "<=1"
7. p7_reagents_and_materials:
- "Common commercially available reagents (e.g. methanol, acetonitrile, HNO3, etc.)"
- "Commercially available reagents not common in QC labs (derivatization reagents, SPE cartridges, SPME fibers, etc.)"
- "Need to be synthesized in the lab with common instrumentation and in a simple way"
- "Need to be synthesized in the lab with advanced equipment or know-how (specially designed metal-organic frameworks, modified nanomaterials, etc.)"
8. p8_preconcentration_req:
- "No preconcentration required. Required sensitivity and /or legislation criteria are met directly."
- "Preconcentration required. Required sensitivity is met with one-step preconcentration."
- "Preconcentration required. Legislation criteria met after complicated stages (e.g. extraction, evaporation, and reconstitution)."
9. p9_automation_degree:
- "Fully automated with novel technology advanced devices (robotics, lab-in-syringe, etc.)"
- "Semi-automated with common devices (e.g. HPLC autosampler)"
- "Semi-automated with non-common devices (e.g. homemade systems)"
- "Manual treatment and analysis"
10. p10_sample_amount_req:
- "<100 Β΅L (or mg) bioanalytical samples; <10 mL (or g) food/environmental"
- "100-500 Β΅L (or mg) bioanalytical samples; 10-50 mL (or g) food/environmental"
- "501-1000 Β΅L (or mg) bioanalytical samples; 50.1-100 mL (or g) food/environmental"
- ">1000 Β΅L (or mg) bioanalytical samples; >100 mL (or g) food/environmental"
Return ONLY a valid JSON object matching this schema exactly:
{
""" + _SUMMARY_SCHEMA + """
"p1_type_of_analysis": "Exact string from options",
"p2_number_of_analytes": "Exact string from options",
"p3_instrumentation": "Exact string from options",
"p4_sample_prep_capacity": "Exact string from options",
"p5_sample_prep_scale": "Exact string from options",
"p6_sample_throughput": "Exact string from options",
"p7_reagents_and_materials": "Exact string from options",
"p8_preconcentration_req": "Exact string from options",
"p9_automation_degree": "Exact string from options",
"p10_sample_amount_req": "Exact string from options",
"_evidence": { "p1_type_of_analysis": {"quote": "verbatim sentence from document", "reasoning": "why this option was chosen"}, "p2_number_of_analytes": {"quote": "...", "reasoning": "..."} }
}
Do NOT add any text after the closing } brace.
Document Text:
================"""
# Prompt for the ten Red Analytical Performance Index (RAPI) validation
# parameters. Each value is free text (extracted figure with context) or the
# literal 'not tested'. This prompt carries its own "_evidence" example rather
# than the shared evidence extension, and it has no newline after the final
# "================" delimiter.
RAPI_PROMPT = """You are an expert analytical chemistry assistant. Extract Red Analytical Performance Index (RAPI) parameters AND article metadata from the following paper.
""" + _SUMMARY_INSTRUCTIONS + """
RAPI EXTRACTION - CRITICAL READING INSTRUCTIONS:
In analytical chemistry papers, method validation parameters are ALMOST ALWAYS
reported in the Results and Discussion section, Method Validation subsection,
Figures of Merit table, or Validation Parameters section.
Do NOT restrict your search to the Experimental/Methods section alone.
Search EVERY section. Only return 'not tested' if truly absent from the entire paper.
10 PARAMETERS TO EXTRACT:
1. repeatability_rsd
WHAT: Intra-day precision expressed as %RSD.
WHERE: Results/Validation tables. Keywords: "repeatability", "intra-day precision",
"intra-assay precision", "%RSD", "CV%". Report the range or maximum value.
Example output: "RSD < 2.1% (intra-day, n=6)"
2. int_precision_rsd
WHAT: Inter-day/intermediate precision expressed as %RSD.
WHERE: Results/Validation tables. Keywords: "intermediate precision",
"inter-day precision", "inter-assay", values measured on DIFFERENT days.
Example output: "RSD < 3.5% (inter-day, 3 consecutive days)"
3. reproducibility_rsd
WHAT: Between-laboratory reproducibility %RSD.
WHERE: Keywords: "reproducibility", "between-lab", "interlaboratory".
Rarely reported - set to 'not tested' only if truly absent.
Example output: "not tested" OR "RSD < 5% (3 labs)"
4. trueness
WHAT: Accuracy as recovery % or bias from Certified Reference Material (CRM).
WHERE: Results/Validation. Keywords: "trueness", "accuracy", "CRM recovery",
"bias", "% deviation", "% error". Distinguished from spiked recovery by CRM use.
Example output: "98.5-101.2% (CRM: NIST SRM 1573a)"
5. recovery_me
WHAT: Matrix recovery % and/or Matrix Effect (ME) % from spiked samples.
WHERE: Results/Validation. Keywords: "recovery", "spiked samples",
"matrix effect", "signal suppression/enhancement", "IS-corrected recovery".
Example output: "Recovery: 89-110%, ME: -8 to +5%"
6. loq
WHAT: Limit of Quantification numerical value with unit.
WHERE: Any section. Keywords: "LOQ", "LLOQ", "limit of quantification",
"quantification limit". Usually in a Figures of Merit or Validation table.
Example output: "0.05 ug/mL" OR "0.1-2.5 ng/mL (analytes)"
7. working_range
WHAT: Validated linear/dynamic range from LOQ to upper limit.
WHERE: Any section. Keywords: "linear range", "working range",
"dynamic range", "calibration range".
Example output: "0.05-50 ug/mL"
8. linearity_r2
WHAT: Coefficient of determination R2 (or r2) for the calibration curve.
WHERE: Any section. Keywords: "R2", "r2", "correlation coefficient",
"linearity", "coefficient of determination".
Example output: "R2 > 0.998" OR "r = 0.9995"
9. robustness_factors
WHAT: Number of factors tested in a robustness/ruggedness study.
WHERE: Results/Experimental. Keywords: "robustness", "ruggedness",
"Plackett-Burman", "factorial design". Count the variables tested.
Example output: "7 factors (pH, temperature, flow rate...)"
10. selectivity_interferents
WHAT: Number of potential interferents explicitly tested for selectivity.
WHERE: Results/Validation. Keywords: "selectivity", "specificity",
"interferences", "interferents", "cross-reactivity".
Example output: "12 interferents tested" OR "not tested"
Return ONLY a valid JSON object exactly matching this schema:
{
""" + _SUMMARY_SCHEMA + """
"repeatability_rsd": "extracted value with context, or 'not tested'",
"int_precision_rsd": "extracted value with context, or 'not tested'",
"reproducibility_rsd": "extracted value with context, or 'not tested'",
"trueness": "extracted value with context, or 'not tested'",
"recovery_me": "extracted value with context, or 'not tested'",
"loq": "numerical value + unit, or 'not tested'",
"working_range": "range from-to with unit, or 'not tested'",
"linearity_r2": "R2 value or range, or 'not tested'",
"robustness_factors": "number of factors, or 'not tested'",
"selectivity_interferents": "number of interferents, or 'not tested'",
"_evidence": {
"repeatability_rsd": {"quote": "verbatim sentence from paper", "reasoning": "why"},
"int_precision_rsd": {"quote": "...", "reasoning": "..."},
"loq": {"quote": "...", "reasoning": "..."}
}
}
Do NOT add any text after the closing } brace.
Document Text:
================"""
def _clean_json(raw: str) -> dict:
"""Robust JSON extractor: handles fences, trailing text, truncation, ellipsis."""
import re
raw = raw.strip()
# 1. Strip markdown code fences
if "```" in raw:
parts = raw.split("```")
for part in parts:
cleaned = part.strip()
if cleaned.startswith("json"):
cleaned = cleaned[4:].strip()
if cleaned.startswith("{"):
raw = cleaned
break
raw = raw.strip()
# 2. Remove any trailing text after the last closing brace of the root object
# Walk from end to find balanced closing brace
depth = 0
end_idx = -1
in_str = False
escape = False
for i, ch in enumerate(raw):
if escape:
escape = False
continue
if ch == '\\' and in_str:
escape = True
continue
if ch == '"' and not escape:
in_str = not in_str
if not in_str:
if ch == '{':
depth += 1
elif ch == '}':
depth -= 1
if depth == 0:
end_idx = i
break
if end_idx != -1:
raw = raw[:end_idx + 1]
# 3. Remove ellipsis placeholders that break JSON (e.g. "...": ... or , ...)
raw = re.sub(r',\s*\.{2,}\s*}', '}', raw) # trailing , ... }
raw = re.sub(r',\s*\.{2,}\s*$', '', raw) # trailing , ...
raw = re.sub(r'"\.\.\."|\.\.\.', '"..."', raw) # bare ellipsis β†’ quoted
# 4. Attempt direct parse first
try:
return json.loads(raw)
except json.JSONDecodeError:
pass
# 5. Try to fix missing comma before _evidence key
fixed = re.sub(r'"\s*\}\s*"_evidence"', '"}\n ,"_evidence"', raw)
try:
return json.loads(fixed)
except json.JSONDecodeError:
pass
# 6. Strip the _evidence block entirely and return core data
no_ev = re.sub(r',?\s*"_evidence"\s*:\s*\{[^}]*(?:\{[^}]*\}[^}]*)*\}', '', raw)
try:
return json.loads(no_ev)
except json.JSONDecodeError:
pass
# 7. Last resort: load with aggressive cleanup (truncated JSON)
# Try appending closing braces up to depth
for extra in ['}', '}}', '}}}']:
try:
return json.loads(raw + extra)
except json.JSONDecodeError:
pass
# Re-raise original error with context
raise json.JSONDecodeError(f"Could not repair LLM JSON output", raw, 0)
# ─── SHARED TARGET-TECHNIQUE OVERRIDE BUILDER ───────────────────────────────
def _build_target_override(target_technique: str) -> str:
    """
    Returns a prompt injection instructing the LLM to focus ONLY on the
    Experimental/Methods/Validation sections for the specified technique,
    and to ignore any mention of it in Introduction/Literature Review.

    Args:
        target_technique: Name of the technique to focus on. ``None``, an
            empty string, or a whitespace-only string means "no override".

    Returns:
        The override text to append to the base prompt, or ``""`` when no
        technique was given. Surrounding whitespace in the technique name
        is removed before it is quoted in the text.
    """
    technique = (target_technique or "").strip()
    if not technique:
        return ""
    # NOTE: the bullet/marker glyphs below were previously stored double-encoded
    # (mojibake); they are restored to the intended characters here.
    return f"""
🚨 CRITICAL FOCUS INSTRUCTION — READ BEFORE EXTRACTING:
=============================================================
This article describes or compares MULTIPLE analytical methods.
YOU MUST EXTRACT PARAMETERS **ONLY** FOR: '{technique}'
STRICT RULES:
✓ SOURCE all parameter values EXCLUSIVELY from these sections:
• Experimental / Materials & Methods
• Instrumentation & Apparatus
• Sample Preparation / Procedure
• Method Validation / Results & Discussion for '{technique}' specifically
✗ DO NOT extract values from:
• Introduction or Literature Review (even if '{technique}' is mentioned there)
• Comparison tables that cite other authors’ values — use ONLY in-study values
• Any section describing a different technique, even if adjacent
HANDLING MISSING DATA:
- If a parameter is not reported for '{technique}' in this paper, set the value
to its default/null equivalent (do NOT invent or borrow values from another technique).
- In the _evidence quote field, write: "Not explicitly reported for {technique}"
if you cannot find a verbatim quote.
=============================================================
"""
# ─── GEMINI PROVIDER ─────────────────────────────────────────
class GeminiExtractor:
    """Extractor that sends prompts to Google Gemini (``gemini-2.0-flash``)."""

    def __init__(self, api_key: str):
        # Imported lazily so the SDK is only required when this provider is used.
        import google.generativeai as genai

        genai.configure(api_key=api_key)
        self.model = genai.GenerativeModel("gemini-2.0-flash")

    def extract_text(self, pdf_path: str) -> str:
        """Return the text of the PDF at *pdf_path* via the shared reader."""
        pdf_text = _read_pdf(pdf_path)
        return pdf_text

    def analyze_document(self, text: str, analysis_type: str = "agree", target_technique: str = "") -> Dict[str, Any]:
        """Build the prompt for *analysis_type*, query Gemini, and parse the JSON reply.

        Unrecognised analysis types fall back to ``EXTRACTION_PROMPT``. Only the
        first 15000 characters of *text* are included in the prompt.
        """
        templates = {
            "nqs": NQS_PROMPT,
            "complexmogapi": COMPLEXMOGAPI_PROMPT,
            "ecoscale": ECOSCALE_PROMPT,
            "cafri": CAFRI_PROMPT,
            "bagi": BAGI_PROMPT,
            "rapi": RAPI_PROMPT,
            "scan_techniques": SCAN_TECHNIQUES_PROMPT,
        }
        template = templates.get(analysis_type, EXTRACTION_PROMPT)
        focus_block = _build_target_override(target_technique)
        closing = "\n================\nReturn ONLY the JSON object, no other text."
        full_prompt = "".join([template, focus_block, text[:15000], closing])
        reply = self.model.generate_content(full_prompt)
        return _clean_json(reply.text)
# ─── GROQ PROVIDER (Llama 3 — FREE) ──────────────────────────
class GroqExtractor:
    """Extractor that sends prompts to Groq (``llama-3.3-70b-versatile``)."""

    def __init__(self, api_key: str):
        # Imported lazily so the SDK is only required when this provider is used.
        from groq import Groq

        self.client = Groq(api_key=api_key)

    def extract_text(self, pdf_path: str) -> str:
        """Return the text of the PDF at *pdf_path* via the shared reader."""
        return _read_pdf(pdf_path)

    def analyze_document(self, text: str, analysis_type: str = "agree", target_technique: str = "") -> Dict[str, Any]:
        """Build the prompt for *analysis_type*, query Groq, and parse the JSON reply.

        Unrecognised analysis types fall back to ``EXTRACTION_PROMPT``. Only the
        first 12000 characters of *text* are included in the prompt.
        """
        prompts_by_type = {
            "nqs": NQS_PROMPT,
            "complexmogapi": COMPLEXMOGAPI_PROMPT,
            "ecoscale": ECOSCALE_PROMPT,
            "cafri": CAFRI_PROMPT,
            "bagi": BAGI_PROMPT,
            "rapi": RAPI_PROMPT,
            "scan_techniques": SCAN_TECHNIQUES_PROMPT,
        }
        selected = prompts_by_type.get(analysis_type, EXTRACTION_PROMPT)
        override = _build_target_override(target_technique)
        excerpt = text[:12000]
        user_prompt = (
            selected
            + override
            + excerpt
            + "\n================\nReturn ONLY the JSON object, no markdown, no extra text."
        )

        conversation = [
            {"role": "system", "content": "You are a JSON extraction assistant. Always respond with only valid JSON."},
            {"role": "user", "content": user_prompt},
        ]
        completion = self.client.chat.completions.create(
            model="llama-3.3-70b-versatile",
            messages=conversation,
            temperature=0.1,
            max_tokens=2500,
        )
        answer = completion.choices[0].message.content
        return _clean_json(answer)
# ─── OPENAI PROVIDER ─────────────────────────────────────────
class OpenAIExtractor:
    """Extractor that sends prompts to OpenAI (``gpt-4o-mini``) in JSON-object mode."""

    def __init__(self, api_key: str):
        # Imported lazily so the SDK is only required when this provider is used.
        from openai import OpenAI

        self.client = OpenAI(api_key=api_key)

    def extract_text(self, pdf_path: str) -> str:
        """Return the text of the PDF at *pdf_path* via the shared reader."""
        return _read_pdf(pdf_path)

    def analyze_document(self, text: str, analysis_type: str = "agree", target_technique: str = "") -> Dict[str, Any]:
        """Build the prompt for *analysis_type*, query OpenAI, and decode the JSON reply.

        Unrecognised analysis types fall back to ``EXTRACTION_PROMPT``. Only the
        first 14000 characters of *text* are included in the prompt. The reply
        is decoded with ``json.loads`` directly because the request asks for
        ``response_format={"type": "json_object"}``.
        """
        prompt_table = {
            "nqs": NQS_PROMPT,
            "complexmogapi": COMPLEXMOGAPI_PROMPT,
            "ecoscale": ECOSCALE_PROMPT,
            "cafri": CAFRI_PROMPT,
            "bagi": BAGI_PROMPT,
            "rapi": RAPI_PROMPT,
            "scan_techniques": SCAN_TECHNIQUES_PROMPT,
        }
        head = prompt_table.get(analysis_type, EXTRACTION_PROMPT)
        focus = _build_target_override(target_technique)
        pieces = [head, focus, text[:14000], "\n================\nReturn ONLY the JSON object."]
        user_prompt = "".join(pieces)

        system_message = {
            "role": "system",
            "content": "You are a JSON extraction assistant. Always respond with only valid JSON.",
        }
        user_message = {"role": "user", "content": user_prompt}

        completion = self.client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[system_message, user_message],
            temperature=0.1,
            response_format={"type": "json_object"},
        )
        payload = completion.choices[0].message.content
        return json.loads(payload)
# ─── OPENROUTER (FREE) PROVIDER ──────────────────────────────
class OpenRouterExtractor:
    """Extractor that talks to OpenRouter through the OpenAI client.

    Uses the ``meta-llama/llama-3.3-70b-instruct:free`` model.
    """

    def __init__(self, api_key: str):
        # Imported lazily so the SDK is only required when this provider is used.
        from openai import OpenAI

        self.client = OpenAI(base_url="https://openrouter.ai/api/v1", api_key=api_key)

    def extract_text(self, pdf_path: str) -> str:
        """Return the text of the PDF at *pdf_path* via the shared reader."""
        return _read_pdf(pdf_path)

    def analyze_document(self, text: str, analysis_type: str = "agree", target_technique: str = "") -> Dict[str, Any]:
        """Build the prompt for *analysis_type*, query OpenRouter, and parse the JSON reply.

        Unrecognised analysis types fall back to ``EXTRACTION_PROMPT``. Only the
        first 14000 characters of *text* are included in the prompt.
        """
        known_prompts = {
            "nqs": NQS_PROMPT,
            "complexmogapi": COMPLEXMOGAPI_PROMPT,
            "ecoscale": ECOSCALE_PROMPT,
            "cafri": CAFRI_PROMPT,
            "bagi": BAGI_PROMPT,
            "rapi": RAPI_PROMPT,
            "scan_techniques": SCAN_TECHNIQUES_PROMPT,
        }
        chosen_prompt = known_prompts.get(analysis_type, EXTRACTION_PROMPT)
        technique_note = _build_target_override(target_technique)
        footer = "\n================\nReturn ONLY the JSON object, no markdown."
        request_text = chosen_prompt + technique_note + text[:14000] + footer

        request = {
            "model": "meta-llama/llama-3.3-70b-instruct:free",
            "messages": [
                {"role": "system", "content": "You are a JSON extraction assistant. Always respond with only valid JSON."},
                {"role": "user", "content": request_text},
            ],
            "temperature": 0.1,
            "max_tokens": 3000,
        }
        result = self.client.chat.completions.create(**request)
        return _clean_json(result.choices[0].message.content)
# ─── SHARED PDF READER ───────────────────────────────────────
def _read_pdf(pdf_path: str) -> str:
    """Return the extracted text of every page in the PDF at *pdf_path*.

    Each page that yields non-empty text contributes that text followed by a
    newline; pages with no extractable text are skipped.
    """
    with open(pdf_path, "rb") as handle:
        document = PyPDF2.PdfReader(handle)
        # Extraction must happen while the file is still open.
        page_texts = [page.extract_text() for page in document.pages]
    return "".join(content + "\n" for content in page_texts if content)
# ─── LOCAL LLM PROVIDER (LM Studio / Ollama — OpenAI-compatible) ─────────────
class LocalLLMExtractor:
    """
    Connects to any local OpenAI-compatible server.
    api_key format: "<base_url>|<model_name>"
    Examples:
    LM Studio: "http://localhost:1234|lmstudio-community/Meta-Llama-3-8B-Instruct-GGUF"
    Ollama: "http://localhost:11434|llama3"
    If the separator '|' is missing, the whole string is treated as base_url
    and model defaults to 'local-model'.
    An empty base_url (with or without the separator) falls back to the
    provider's default URL; an empty model name falls back to 'local-model'.
    """

    DEFAULT_URLS = {
        "lmstudio": "http://localhost:1234",
        "ollama": "http://localhost:11434",
    }

    def __init__(self, api_key: str, provider_hint: str = "local"):
        # Imported lazily so the SDK is only required when this provider is used.
        from openai import OpenAI

        base_url, self.model_name = self._parse_endpoint(api_key, provider_hint)
        self.client = OpenAI(base_url=base_url, api_key="not-needed")

    @classmethod
    def _parse_endpoint(cls, api_key: str, provider_hint: str = "local"):
        """Split an ``api_key`` spec into ``(base_url, model_name)``.

        The returned base URL has no trailing slash and always ends in
        ``/v1``. Missing or blank parts are replaced by defaults:
        ``DEFAULT_URLS[provider_hint]`` (or ``http://localhost:1234`` for an
        unknown hint) and ``'local-model'``.
        """
        url_part, _, model_part = (api_key or "").partition("|")

        base_url = url_part.strip().rstrip("/")
        if not base_url:
            # Covers "", "   " and "|model": use the provider's default server.
            base_url = cls.DEFAULT_URLS.get(provider_hint, "http://localhost:1234")

        model_name = model_part.strip() or "local-model"

        # Ensure the path ends with /v1 (required for OpenAI-compat)
        if not base_url.endswith("/v1"):
            base_url = base_url + "/v1"
        return base_url, model_name

    def extract_text(self, pdf_path: str) -> str:
        """Return the text of the PDF at *pdf_path* via the shared reader."""
        return _read_pdf(pdf_path)

    def analyze_document(self, text: str, analysis_type: str = "agree", target_technique: str = "") -> Dict[str, Any]:
        """Build the prompt for *analysis_type*, query the local server, and parse the JSON reply.

        Unrecognised analysis types fall back to ``EXTRACTION_PROMPT``. Only the
        first 12000 characters of *text* are included in the prompt.
        """
        prompt_map = {
            "nqs": NQS_PROMPT, "complexmogapi": COMPLEXMOGAPI_PROMPT,
            "ecoscale": ECOSCALE_PROMPT, "cafri": CAFRI_PROMPT,
            "bagi": BAGI_PROMPT, "rapi": RAPI_PROMPT,
            "scan_techniques": SCAN_TECHNIQUES_PROMPT,
        }
        base_prompt = prompt_map.get(analysis_type, EXTRACTION_PROMPT)
        # Use the shared override builder like every other provider, so local
        # models receive the same section-scoping and missing-data rules.
        target_override = _build_target_override(target_technique)
        prompt = base_prompt + target_override + text[:12000] + "\n================\nReturn ONLY the JSON object, no markdown, no extra text."
        response = self.client.chat.completions.create(
            model=self.model_name,
            messages=[
                {"role": "system", "content": "You are a JSON extraction assistant. Always respond with only valid JSON."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.1,
            max_tokens=2500,
        )
        return _clean_json(response.choices[0].message.content)
# ─── FACTORY ─────────────────────────────────────────────────
def PDFExtractor(provider: str, api_key: str):
    """Factory: returns the right extractor for the chosen provider.

    Args:
        provider: Provider name, matched case-insensitively and ignoring
            surrounding whitespace. One of ``gemini``, ``groq``, ``openai``,
            ``openrouter``, ``lmstudio``, ``ollama`` or ``local``.
        api_key: Passed through to the extractor's constructor.

    Returns:
        An instance of the extractor class for *provider*.

    Raises:
        ValueError: If *provider* is empty, ``None`` or not a known name.
    """
    local_names = ("lmstudio", "ollama", "local")
    cloud_names = ("gemini", "groq", "openai", "openrouter")

    p = (provider or "").strip().lower()
    if p in local_names:
        return LocalLLMExtractor(api_key, provider_hint=p)
    # Validate before resolving classes so the error lists every accepted name.
    if p not in cloud_names:
        raise ValueError(
            f"Unknown provider '{provider}'. Choose from: {list(cloud_names + local_names)}"
        )

    providers = {
        "gemini": GeminiExtractor,
        "groq": GroqExtractor,
        "openai": OpenAIExtractor,
        "openrouter": OpenRouterExtractor,
    }
    return providers[p](api_key)