Spaces:
Running
Running
| import json | |
| import PyPDF2 | |
| from typing import Dict, Any | |
# ─── CRITICAL UNIT RULES (used in prompt) ────────────────────────────────────
| # All numeric amounts MUST be reported in grams (g) or milliliters (mL). | |
# ─── SHARED ARTICLE SUMMARY SCHEMA ───────────────────────────────────────────
# JSON fragment describing the "article_summary" object. It is spliced into a
# larger JSON schema by the prompt constants, so it deliberately ends with a
# trailing comma and carries no enclosing braces of its own.
_SUMMARY_SCHEMA = "\n".join([
    '"article_summary": {',
    '"analytes": ["list ALL analytes/compounds being determined. E.g. Metformin, Sitagliptin"],',
    '"matrices": ["list ALL sample matrices. E.g. Tablet, Plasma, Urine, Serum, Saliva, Water, Food, Soil"],',
    '"techniques": ["list ALL techniques MENTIONED anywhere in the paper (including literature review)"],',
    '"developed_techniques": ["list ONLY the technique(s) that were DEVELOPED, VALIDATED, or EXPERIMENTALLY APPLIED in THIS paper. These must have an Experimental section, instrumentation setup, validation data (LOD/LOQ/precision/recovery) or sample preparation described in detail. Do NOT include techniques only cited or compared from literature."],',
    '"quantification_method": "External Standard | Standard Addition | Internal Standard | Calibration Curve",',
    '"validation_status": "Fully Validated | Partially Validated | Not Validated",',
    '"key_findings": "1-2 sentence summary of the paper purpose and conclusion"',
    '},',
])
# Rules telling the model how to fill each "article_summary" field. This text
# is concatenated into the prompt constants that follow.
# NOTE: the non-ASCII glyphs (─, —, ✓, ✗, •, curly quotes) are intentional;
# keep this file saved and read as UTF-8 so they are not garbled again.
_SUMMARY_INSTRUCTIONS = """
──────────────────────────────────────────────
ARTICLE SUMMARY EXTRACTION RULES:
- analytes: EVERY compound measured. Check title, abstract, intro, conclusions.
- matrices: ALL biological/pharmaceutical/environmental matrices.
- techniques (all-mentioned): ALL instruments/techniques cited anywhere — Intro, Methods, Results, Discussion.
- developed_techniques (KEY FIELD — read carefully):
✓ INCLUDE a technique if the paper contains:
• A dedicated experimental/methods subsection describing the instrument setup for it
• Validation data specific to it (linearity, LOD, LOQ, precision, accuracy, recovery, robustness)
• Sample-preparation steps designed for it
• Chromatographic/spectral conditions (column, mobile phase, wavelength, gradient, etc.)
✗ EXCLUDE a technique if it is only:
• Mentioned in Introduction or Literature Review as a comparison or background reference
• Cited as “previously reported by [author]” or “other studies used X”
• Referenced for comparison in a table without being experimentally performed in this paper
- validation_status: Look for “ICH”, “USP”, “linearity”, “LOD”, “LOQ”, “precision”, “accuracy”, “recovery”.
──────────────────────────────────────────────
"""
# ─── LIGHTWEIGHT TECHNIQUE SCANNER (Phase 1 only) ─────────────────────────────
# Prompt asking the model only for the article summary, with emphasis on
# separating techniques developed/validated in the paper from those merely
# cited. The prompt ends with a "Document Text:" header, so the article text
# is expected to be appended after it.
# NOTE: the non-ASCII glyphs (—, ✓, ✗) are intentional; keep the file UTF-8.
SCAN_TECHNIQUES_PROMPT = """You are an expert analytical chemistry assistant reviewing a scientific article.
Your SOLE task is to identify which analytical technique(s) were actually DEVELOPED, OPTIMISED
or VALIDATED in this paper — as distinct from those merely cited in the literature review.
""" + _SUMMARY_INSTRUCTIONS + """
CRITICAL CLASSIFICATION RULES FOR 'developed_techniques':
================================================
A technique belongs in 'developed_techniques' ONLY if the paper contains AT LEAST ONE of:
✓ A dedicated Experimental/Materials & Methods subsection describing instrument setup for it
✓ Specific chromatographic/spectral conditions (column, mobile phase, wavelength, temperature,
gradient, flow rate, detector settings, etc.) for it
✓ Validation results generated in this study (linearity, LOD, LOQ, precision, accuracy,
recovery, robustness, stability)
✓ Sample preparation steps specifically designed around it
✓ New application of the technique to a matrix or compound presented as the paper's
original contribution
A technique must NOT be in 'developed_techniques' if it only appears as:
✗ A reference cited in the Introduction or Literature Review ("Smith et al. used HPLC-UV...")
✗ A technique listed in a comparison table copied from literature without being run here
✗ General background or motivation text
✗ A technique applied by a previous author that this paper is replacing or improving upon
================================================
IMPORTANT: A single paper may develop MORE THAN ONE technique (e.g. both HPLC-UV and
spectrophotometry). List ALL techniques that qualify by the rules above.
Return ONLY a valid JSON object:
{
"article_summary": {
"analytes": ["ALL analytes measured in this study"],
"matrices": ["ALL sample matrices used in this study"],
"techniques": ["ALL techniques MENTIONED anywhere in the paper (intro, methods, results)"],
"developed_techniques": [
"ONLY techniques with experimental setup/validation in THIS paper — apply the rules above.",
"Be SPECIFIC: HPLC-UV not just HPLC. GC-FID not just GC."
],
"quantification_method": "External Standard | Standard Addition | Internal Standard | Calibration Curve",
"validation_status": "Fully Validated | Partially Validated | Not Validated",
"key_findings": "1-2 sentence summary of the paper purpose and conclusion"
}
}
Document Text:
================
"""
# ─── EVIDENCE EXTENSION appended to every prompt before Document Text ─────────
# Instruction block requiring the model to return an "_evidence" object with a
# verbatim quote and short reasoning for every parameter key it emits.
# NOTE: the non-ASCII glyphs (─, —) are intentional; keep the file UTF-8.
_EVIDENCE_EXTENSION = """
──────────────────────────────────────────────
EVIDENCE REQUIREMENT — YOU MUST ALSO RETURN "_evidence":
For EVERY parameter key you return in the JSON above, also include a matching entry
in an "_evidence" object with TWO fields:
"quote": copy the EXACT sentence(s) from the document text that led to your answer
(max 2 sentences, verbatim from the text)
"reasoning": 1-2 sentences explaining HOW you interpreted the quote to arrive at the value
IMPORTANT: YOU MUST USE THE EXACT PARAMETER KEYS FROM YOUR SPECIFIC EXTRACTION TASK!
Do NOT use dummy keys. For example, if you extracted a parameter named "p1_type_of_analysis",
your _evidence block MUST contain "p1_type_of_analysis".
Bad example (too vague):
"_evidence": { "some_parameter_key": { "quote": "HPLC was used", "reasoning": "It's HPLC" } }
Good example (specific, verbatim, explains the logic, using the exact key you extracted):
"_evidence": {
"YOUR_EXACT_PARAMETER_KEY_HERE": {
"quote": "Chromatographic separation was performed on a Waters Acquity UPLC H-Class system with a BEH C18 column at 1.0 mL/min.",
"reasoning": "The system described is a UPLC instrument. The flow rate and BEH column are characteristic of UPLC, not standard HPLC."
}
}
Include _evidence for ALL parameter keys (article_summary fields + every scoring parameter).
For article_summary sub-fields (analytes, matrices, techniques), use the quote from the abstract or title.
For boolean fields, the quote should show the text that CONFIRMED or DENIED the feature.
──────────────────────────────────────────────
"""
# Prompt for extracting the twelve green-chemistry parameters (p1..p12) plus
# the article summary and per-key evidence. Ends with a "Document Text:"
# header, so the article text is expected to be appended after it.
# NOTE: the unit-conversion rules rely on non-ASCII glyphs (µ, →, ×); keep the
# file UTF-8, otherwise the rules reach the model as unreadable text.
EXTRACTION_PROMPT = """You are an expert analytical chemistry assistant. Extract green chemistry parameters AND article metadata from the following analytical chemistry paper.
──────────────────────────────────────────────
UNIT CONVERSION RULES — APPLY BEFORE WRITING JSON:
µg→g: /1,000,000 | mg→g: /1,000 | kg→g: ×1,000
µL→mL: /1,000 | nL→mL: /1,000,000 | L→mL: ×1,000
Waste (p7): SUM ALL solvents+reagents+mobile phase per run.
Toxicity (p11): SUM only hazardous solvents (MeCN, MeOH, DCM, CHCl3, hexane, toluene, ethyl acetate).
──────────────────────────────────────────────
SMART EXTRACTION GUIDANCE:
- p1 (Sample treatment): Look for "in-line","on-line","at-line","off-line","remote sensing","non-invasive","direct analysis".
- p2 (Sample amount): "sample weight","sample volume","aliquot","injection volume" in mg/µg/µL/g/mL.
- p3 (Device positioning): Classify device relationship to process: in-line/at-line/off-line.
- p4 (Steps): Count each: weighing, dissolving, filtration, centrifugation, SPE, LLE, dilution, derivatization, evaporation, reconstitution, sonication.
- p5 (Automation): "automated","autosampler","FIA","robotic","manual","semi-automated". Miniaturized: "micro","nano","chip","capillary","µL volumes".
- p6 (Derivatization): "derivatization","pre-column reaction","post-column reaction","fluorescent tag","ion pair","labeling agent".
- p7 (Waste): For LC/HPLC: flow_rate_mL_per_min × run_time_min. For batch: sum all solvent volumes.
- p8 (Throughput): Count target analytes. Runs/hour = 60/analysis_time_minutes.
- p9 (Technique): Identify primary from: HPLC, UPLC, GC, GC-MS, LC-MS, LC-MS/MS, HPTLC, TLC, UV-Vis, Fluorimetry, Potentiometry, Electrochemistry, AAS, ICP-MS, NMR, FTIR, NIR.
- p10 (Bio-based): "green solvents","bio-based","ethanol","water","CO2","deep eutectic solvents","ionic liquids".
- p11 (Toxic): acetonitrile, methanol, chloroform, dichloromethane, hexane, toluene, ethyl acetate, acetone.
- p12 (GHS threats 0-7): harmful to aquatic=1, bioaccumulative=1, persistent=1, flammable=1, oxidizing=1, explosive=1, corrosive=1.
""" + _SUMMARY_INSTRUCTIONS + """
Return ONLY a valid JSON object:
{
""" + _SUMMARY_SCHEMA + """
"p1_option": "one of: Remote sensing without sample damage | Remote sensing with little physical damage | Non-invasive analysis | In-field sampling and direct analysis | In-field sampling and on-line analysis | On-line analysis | At-line analysis | Off-line analysis | External sample pre- and treatment and batch analysis (reduced number of steps) | External sample pre- and treatment and batch analysis (large number of steps)",
"p2_amount": <float in g or mL>,
"p3_option": "one of: in-line | on-line | at-line | off-line",
"p4_steps": <integer>,
"p5_automation": "one of: automatic | semi-automatic | manual",
"p5_miniaturized": <true/false>,
"p6_has_derivatization": <true/false>,
"p6_agents_cas": ["list of CAS strings"],
"p7_waste_amount": <float mL>,
"p8_analytes_per_run": <integer>,
"p8_runs_per_hour": <float>,
"p9_technique": "one of: None | FTIR | Hot plate solvent evaporation (<10 min) | Rotary evaporation | Needle evaporation | Ultrasound-assisted extraction | SPE and SPME | Microbiological assays | Immunoassay | Spectrofluorometry | Titration | UPLC | UV-Vis Spectrometry | Energy dispersive X-ray fluorescence | Potentiometry | Non-instrumental detection | Hot plate solvent evaporation (10-150 min) | Accelerated solvent extraction | Supercritical fluid extraction | Microwave assisted extraction | Flame atomic absorption spectrometry | Electrothermal atomic absorption spectrometry | GC | ICP-MS | ICP-OES | LC | Hot plate solvent evaporation (>150 min) | Soxhlet extraction | NMR | GC-MS | LC-MS | X-ray diffractometry",
"p10_biobased_status": "one of: No reagents | All reagents are bio-based | Some reagents are bio-based | None of the reagents are from bio-based sources",
"p11_has_toxic_reagents": <true/false>,
"p11_toxic_amount": <float mL>,
"p12_threats_count": <integer 0-7>,
"_evidence": { "p1_option": {"quote": "...", "reasoning": "..."}, ... }
}
""" + _EVIDENCE_EXTENSION + """
Document Text:
================
"""
# Prompt for extracting NQS (Need, Quality, Sustainability) scores: a need
# tier, r1-r4 / g1-g4 / b1-b4 scores on a 0-100 scale, and seventeen SDG
# booleans, plus the article summary and per-key evidence. Ends with a
# "Document Text:" header, so the article text is expected to follow it.
# NOTE: the em dashes in the field descriptions are intentional non-ASCII
# characters; keep the file UTF-8.
NQS_PROMPT = """You are an expert analytical chemistry assistant. Extract NQS (Need, Quality, and Sustainability) parameters AND article metadata from the following analytical chemistry paper.
""" + _SUMMARY_INSTRUCTIONS + """
Return ONLY a valid JSON object:
{
""" + _SUMMARY_SCHEMA + """
"need_tier": <integer 1-4: 4=simple/wasteless, 3=automated/high-throughput, 2=real-time/trained personnel, 1=high-performance/large-consumption>,
"r1": <float 0-100: Scope — analytes count, matrices covered>,
"r2": <float 0-100: LOD/LOQ sensitivity — ng/mL level = high score>,
"r3": <float 0-100: Precision — RSD<2%=90-100, 2-5%=70-90, >5%=<70>,
"r4": <float 0-100: Accuracy/Recovery — 98-102%=95-100, 95-105%=80-95, other=<70>,
"g1": <float 0-100: Toxicity safety — 100=no hazardous reagents>,
"g2": <float 0-100: Waste amount — 100=minimal waste>,
"g3": <float 0-100: Energy — 100=ambient temp no heating>,
"g4": <float 0-100: Direct operator safety — 100=closed system no vapour>,
"b1": <float 0-100: Cost — 100=cheapest reagents/equipment>,
"b2": <float 0-100: Time — 100=fastest analysis>,
"b3": <float 0-100: Infrastructure — 100=bench-top no specialist facility>,
"b4": <float 0-100: Simplicity — 100=fewest steps minimal training>,
"sdg_1": <true/false: No Poverty — affordable diagnostics>,
"sdg_2": <true/false: Zero Hunger — food safety>,
"sdg_3": <true/false: Good Health — pharmaceutical/clinical/biomedical>,
"sdg_4": <true/false: Quality Education>,
"sdg_5": <true/false: Gender Equality>,
"sdg_6": <true/false: Clean Water — water quality/environmental>,
"sdg_7": <true/false: Clean Energy>,
"sdg_8": <true/false: Economic Growth>,
"sdg_9": <true/false: Innovation — novel methodology>,
"sdg_10": <true/false: Reduced Inequality>,
"sdg_11": <true/false: Sustainable Communities — environmental monitoring>,
"sdg_12": <true/false: Responsible Consumption — green chemistry>,
"sdg_13": <true/false: Climate Action — environmental impact>,
"sdg_14": <true/false: Life Below Water — aquatic contamination>,
"sdg_15": <true/false: Life on Land — soil/plant contamination>,
"sdg_16": <true/false: Peace and Justice — forensic/regulatory>,
"sdg_17": <true/false: Partnerships — multi-institution study>,
"_evidence": { "need_tier": {"quote": "...", "reasoning": "..."}, ... }
}
""" + _EVIDENCE_EXTENSION + """
Document Text:
================
"""
# Prompt for extracting ComplexMoGAPI parameters: ten pre-analysis ("pre_*")
# and fifteen analytical-stage ("an_*") codes, where 0=Green, 1=Yellow, 2=Red
# and 3=Not Applicable for the keys that allow it, plus the article summary
# and per-key evidence. Ends with a "Document Text:" header.
# NOTE: thresholds use non-ASCII glyphs (°, ≤, ×, µ); keep the file UTF-8 so
# the numeric rules stay readable to the model.
COMPLEXMOGAPI_PROMPT = """You are an expert analytical chemistry assistant. Extract ComplexMoGAPI parameters AND article metadata from the following paper.
""" + _SUMMARY_INSTRUCTIONS + """
SMART EXTRACTION FOR ComplexMoGAPI:
PRE-ANALYSIS STAGE:
- pre_yield: yield/recovery % in pre-analytical step. >89%=0, 70-89%=1, <70%=2. (Use 3 if Not Applicable)
- pre_temp_time: room temp <1h=0, room temp >1h OR heating <1h=1, heating >1h OR <0°C=2. (Use 3 if Not Applicable)
- pre_green_economy: count green chemistry rules (atom economy, catalysis, renewable feedstocks). 5-6=0, 3-4=1, 1-2=2.
- pre_health_hazard / pre_safety_hazard: NFPA/GHS of pre-analytical reagents. 0-1=0, 2-3=1, 4=2.
- pre_instrument: common bench=0, semi-advanced=1, autoclave/glove box/high pressure=2.
- pre_energy: ≤0.1kWh=0, ≤1.5kWh=1, >1.5kWh=2. Estimate from heating time × temperature × equipment.
- pre_occupational: hermetized=0, partial=1, vapours to atmosphere=2.
- pre_workup: none/simple=0, standard (filtration/washing)=1, advanced (chromatography/recrystallization)=2. (Use 3 if Not Applicable)
- pre_purity: >98%=0, 97-98%=1, <97%=2. (Use 3 if Not Applicable)
ANALYTICAL STAGE:
- an_collection: in-line (no sampling)=0, on/at-line=1, off-line (lab batch)=2.
- an_preservation: none=0, chemical/physical (freezing/acidification)=1, physicochemical=2.
- an_transport: none=0, required=1, not applicable=2.
- an_storage: room temp=0, refrigeration=1, special (controlled atmosphere/dark/frozen)=2.
- an_method_type: direct analysis no prep=0, simple (dissolution/filtration)=1, extraction required=2.
- an_extraction_scale: nano (<1µL)=0, micro (1-1000µL)=1, macro (>1mL)=2. (Use 3 if Not Applicable)
- an_solvents_type: solvent-free=0, green (water/ethanol)=1, non-green (MeCN/MeOH/DCM)=2.
- an_additional_treatments: none=0, simple (clean-up/evaporation)=1, advanced (derivatization/mineralization)=2.
- an_amount: <10mL=0, 10-100mL=1, >100mL=2.
- an_health_hazard / an_safety_hazard: NFPA/GHS analytical reagents.
- an_energy: UV ~0.05kWh=0, HPLC ~0.5kWh=1, GC-MS/ICP ~1kWh=2.
- an_occupational: sealed automated=0, partial=1, open solvent handling=2.
- an_waste: <1mL=0, 1-10mL=1, >10mL=2.
- an_waste_treatment: recycling/distillation=0, degradation/passivation=1, drain/no treatment=2.
0=Green, 1=Yellow, 2=Red, 3=Not Applicable (only for allowed parameters).
Return ONLY a valid JSON object:
{
""" + _SUMMARY_SCHEMA + """
"pre_yield": <0|1|2|3>,
"pre_temp_time": <0|1|2|3>,
"pre_green_economy": <0|1|2>,
"pre_health_hazard": <0|1|2>,
"pre_safety_hazard": <0|1|2>,
"pre_instrument": <0|1|2>,
"pre_energy": <0|1|2>,
"pre_occupational": <0|1|2>,
"pre_workup": <0|1|2|3>,
"pre_purity": <0|1|2|3>,
"an_collection": <0|1|2>,
"an_preservation": <0|1|2>,
"an_transport": <0|1|2>,
"an_storage": <0|1|2>,
"an_method_type": <0|1|2>,
"an_extraction_scale": <0|1|2|3>,
"an_solvents_type": <0|1|2>,
"an_additional_treatments": <0|1|2>,
"an_amount": <0|1|2>,
"an_health_hazard": <0|1|2>,
"an_safety_hazard": <0|1|2>,
"an_energy": <0|1|2>,
"an_occupational": <0|1|2>,
"an_waste": <0|1|2>,
"an_waste_treatment": <0|1|2>,
"_evidence": { "pre_yield": {"quote": "...", "reasoning": "..."}, ... }
}
""" + _EVIDENCE_EXTENSION + """
Document Text:
================
"""
# Analytical Eco-Scale prompt, assembled from literal text segments
# interleaved with the shared summary rules, summary schema and evidence
# block. The model is asked for a reagent list, an instrument list, an
# occupational-hazard class and waste volume/treatment; the prompt ends with a
# "Document Text:" header.
ECOSCALE_PROMPT = "".join((
    """You are an expert analytical chemistry assistant. Extract Analytical Eco-Scale parameters AND article metadata from the following paper.
""",
    _SUMMARY_INSTRUCTIONS,
    """
SMART EXTRACTION FOR Analytical Eco-Scale:
Identify ALL reagents and instruments used in the analytical PROCEDURE (sample preparation + analysis).
REAGENTS:
For each distinct reagent/chemical used, estimate its amount class ('<10', '10-100', '>100' in mL or g).
Also estimate the number of GHS hazard pictograms it carries:
- warning_pictograms: count of 'Warning' level hazards.
- danger_pictograms: count of 'Danger' level (severe) hazards.
INSTRUMENTS:
For each main instrument used, estimate its energy class ('<0.1', '<1.5', '>1.5' in kWh per sample).
Example: <0.1 (titration, UV-Vis, FTIR), <1.5 (HPLC, GC, AAS), >1.5 (GC-MS, LC-MS, NMR, XRD).
OCCUPATIONAL HAZARD:
"hermetized" (closed system) OR "emission" (vapors/gases emitted to air).
WASTE:
- waste_volume: "none", "<1", "1-10", or ">10" (in mL/g).
- waste_treatment: "recycling", "degradation", "passivation", or "none".
Return ONLY a valid JSON object:
{
""",
    _SUMMARY_SCHEMA,
    """
"reagents": [
{
"name": "<Reagent Name>",
"amount": "<10 | 10-100 | >100",
"warning_pictograms": <integer>,
"danger_pictograms": <integer>
}
],
"instruments": [
{
"name": "<Instrument Name>",
"energy": "<0.1 | <1.5 | >1.5"
}
],
"occupational_hazard": "<hermetized | emission>",
"waste_volume": "<none | <1 | 1-10 | >10>",
"waste_treatment": "<recycling | degradation | passivation | none>",
"_evidence": { "occupational_hazard": {"quote": "...", "reasoning": "..."}, ... }
}
""",
    _EVIDENCE_EXTENSION,
    """
Document Text:
================
""",
))
# Carbon Footprint Reduction Index (CaFRI) prompt, assembled from literal text
# segments interleaved with the shared summary rules, summary schema and
# evidence block. Every CaFRI key takes one value from a fixed option list
# spelled out in the prompt; the prompt ends with a "Document Text:" header.
CAFRI_PROMPT = "".join((
    """You are an expert analytical chemistry assistant. Extract Carbon Footprint Reduction Index (CaFRI) parameters AND article metadata from the following paper.
""",
    _SUMMARY_INSTRUCTIONS,
    """
SMART EXTRACTION FOR CaFRI:
Extract the variables specifically relevant to the Carbon Footprint of the procedure.
If not directly stated, estimate based on the methodological details (e.g., if HPLC is used, estimate power, personnel, throughput normally associated).
JSON KEYS AND VALID VALUES:
- "energy_reduction_program": "Yes" | "No"
- "instrument_power": "< 0.1 kW" | "0.1-1.5 kW" | "> 1.5 kW"
- "energy_intensive_equipment": "Yes" | "No" (e.g., fume hood required?)
- "sample_throughput": "< 10" | "10-30" | "> 30" (samples/hour)
- "carbon_footprint_known": "Yes" | "No"
- "emission_factor": "< 0.1" | "0.1-0.3" | "> 0.3" (kg CO2/kWh)
- "storage": "No storage" | "Normal conditions" | "Special conditions"
- "transported_to_lab": "Yes" | "No"
- "distance": "< 1 mile" | "1-10 miles" | "> 10 miles"
- "samples_per_shipment": "1" | "2-10" | "11-100" | "> 100"
- "ecofriendly_vehicle": "Yes" | "No"
- "personnel_per_sample": "1" | "2-3" | "4-5" | "> 5"
- "automation": "Automatic" | "Semiautomatic" | "Manual"
- "waste_amount": "< 10 mL/g" | "10-100 mL/g" | "> 100 mL/g"
- "waste_disposal": "Specialized personnel" | "Analyst" | "No disposal"
- "recycling": "Same method" | "Other methods" | "No recycling"
- "pictograms": "<= 3" | "4-6" | "7-9" | "> 9" (Total GHS hazard pictograms for all reagents combined)
- "organic_solvents": "< 5 mL" | "5-10 mL" | "> 10 mL" (per sample)
- "reagents_amount": "< 1 g/mL" | "1-3 g/mL" | "> 3 g/mL" (per sample)
Return ONLY a valid JSON object matching these exactly:
{
""",
    _SUMMARY_SCHEMA,
    """
"energy_reduction_program": "<Yes | No>",
"instrument_power": "<< 0.1 kW | 0.1-1.5 kW | > 1.5 kW>",
"energy_intensive_equipment": "<Yes | No>",
"sample_throughput": "<< 10 | 10-30 | > 30>",
"carbon_footprint_known": "<Yes | No>",
"emission_factor": "<< 0.1 | 0.1-0.3 | > 0.3>",
"storage": "<No storage | Normal conditions | Special conditions>",
"transported_to_lab": "<Yes | No>",
"distance": "<< 1 mile | 1-10 miles | > 10 miles>",
"samples_per_shipment": "<1 | 2-10 | 11-100 | > 100>",
"ecofriendly_vehicle": "<Yes | No>",
"personnel_per_sample": "<1 | 2-3 | 4-5 | > 5>",
"automation": "<Automatic | Semiautomatic | Manual>",
"waste_amount": "<< 10 mL/g | 10-100 mL/g | > 100 mL/g>",
"waste_disposal": "<Specialized personnel | Analyst | No disposal>",
"recycling": "<Same method | Other methods | No recycling>",
"pictograms": "<<= 3 | 4-6 | 7-9 | > 9>",
"organic_solvents": "<< 5 mL | 5-10 mL | > 10 mL>",
"reagents_amount": "<< 1 g/mL | 1-3 g/mL | > 3 g/mL>",
"_evidence": { "energy_reduction_program": {"quote": "...", "reasoning": "..."}, ... }
}
""",
    _EVIDENCE_EXTENSION,
    """
Document Text:
================
""",
))
# ─── BAGI PROMPT ─────────────────────────────────────────────────────────────
# Prompt for extracting the ten Blue Applicability Grade Index (BAGI)
# parameters. Each parameter must be answered with one of the option strings
# listed in the prompt, copied exactly. Unlike the prompts that append
# _EVIDENCE_EXTENSION, this one carries its own inline "_evidence" example,
# and it ends without a trailing newline after the "Document Text:" rule.
# NOTE: the sample-amount options contain the micro sign (µ) and the rule line
# uses box-drawing characters; keep the file UTF-8. The closing-brace sentence
# uses a plain "}" because "\}" is not a valid string escape.
BAGI_PROMPT = """You are an expert analytical chemistry assistant. Extract parameters to calculate the Blue Applicability Grade Index (BAGI) of the described analytical method.
""" + _SUMMARY_INSTRUCTIONS + """
──────────────────────────────────────────────
EXTRACT THESE 10 BAGI PARAMETERS BASED ON THE FOLLOWING RULES:
1. p1_type_of_analysis:
- "Quantitative and Confirmatory"
- "Quantitative"
- "Screening"
- "Qualitative"
2. p2_number_of_analytes:
- "Multi-element analysis for > 15 compounds"
- "Multi-element analysis for 6-15 compounds of the same chemical group or 2-15 compounds of different chemical classes"
- "Multi-element analysis for 2-5 compounds of the same chemical class"
- "Single Element"
3. p3_instrumentation:
- "Simple in operation portable instrumentation (smart-phone based detectors, portable GC, etc.)"
- "Simple instrumentation available in most labs (UV, HPLC-UV, HPLC-DAD, UHPLC, FAAS, ETAAS, ICP-OES, GC-FID etc.)"
- "Sophisticated instrumentation (LC-MS, GC-MS, ICP-MS, homemade interfaces, homemade automatic systems, etc.)"
- "Instrumentation that is not commonly available in most labs (SFC, 2D-GC, 2D-LC, LC-MS/MS, GC-MS/MS, etc.)"
4. p4_sample_prep_capacity:
- ">=96"
- "13-95"
- "2-12"
- "1"
5. p5_sample_prep_scale:
- "Not required or on-site sample preparation if required"
- "Simple, low-cost sample preparation required (e.g. protein precipitation)"
- "Miniaturized extraction sample preparation (SPME, DLLME, MEPS, SBSE, d-SPE, FPSE, etc.)"
- "Multi-step sample preparation required (e.g. LLE, SPE and/or derivatization)"
6. p6_sample_throughput:
- ">10"
- "5-10"
- "2-4"
- "<=1"
7. p7_reagents_and_materials:
- "Common commercially available reagents (e.g. methanol, acetonitrile, HNO3, etc.)"
- "Commercially available reagents not common in QC labs (derivatization reagents, SPE cartridges, SPME fibers, etc.)"
- "Need to be synthesized in the lab with common instrumentation and in a simple way"
- "Need to be synthesized in the lab with advanced equipment or know-how (specially designed metal-organic frameworks, modified nanomaterials, etc.)"
8. p8_preconcentration_req:
- "No preconcentration required. Required sensitivity and /or legislation criteria are met directly."
- "Preconcentration required. Required sensitivity is met with one-step preconcentration."
- "Preconcentration required. Legislation criteria met after complicated stages (e.g. extraction, evaporation, and reconstitution)."
9. p9_automation_degree:
- "Fully automated with novel technology advanced devices (robotics, lab-in-syringe, etc.)"
- "Semi-automated with common devices (e.g. HPLC autosampler)"
- "Semi-automated with non-common devices (e.g. homemade systems)"
- "Manual treatment and analysis"
10. p10_sample_amount_req:
- "<100 µL (or mg) bioanalytical samples; <10 mL (or g) food/environmental"
- "100-500 µL (or mg) bioanalytical samples; 10-50 mL (or g) food/environmental"
- "501-1000 µL (or mg) bioanalytical samples; 50.1-100 mL (or g) food/environmental"
- ">1000 µL (or mg) bioanalytical samples; >100 mL (or g) food/environmental"
Return ONLY a valid JSON object matching this schema exactly:
{
""" + _SUMMARY_SCHEMA + """
"p1_type_of_analysis": "Exact string from options",
"p2_number_of_analytes": "Exact string from options",
"p3_instrumentation": "Exact string from options",
"p4_sample_prep_capacity": "Exact string from options",
"p5_sample_prep_scale": "Exact string from options",
"p6_sample_throughput": "Exact string from options",
"p7_reagents_and_materials": "Exact string from options",
"p8_preconcentration_req": "Exact string from options",
"p9_automation_degree": "Exact string from options",
"p10_sample_amount_req": "Exact string from options",
"_evidence": { "p1_type_of_analysis": {"quote": "verbatim sentence from document", "reasoning": "why this option was chosen"}, "p2_number_of_analytes": {"quote": "...", "reasoning": "..."} }
}
Do NOT add any text after the closing } brace.
Document Text:
================"""
# Red Analytical Performance Index (RAPI) prompt, assembled from literal text
# segments interleaved with the shared summary rules and summary schema. It
# asks for ten validation figures as free-text values (or 'not tested') and
# carries its own inline "_evidence" example. The final segment ends without a
# trailing newline after the "Document Text:" rule.
RAPI_PROMPT = "".join((
    """You are an expert analytical chemistry assistant. Extract Red Analytical Performance Index (RAPI) parameters AND article metadata from the following paper.
""",
    _SUMMARY_INSTRUCTIONS,
    """
RAPI EXTRACTION - CRITICAL READING INSTRUCTIONS:
In analytical chemistry papers, method validation parameters are ALMOST ALWAYS
reported in the Results and Discussion section, Method Validation subsection,
Figures of Merit table, or Validation Parameters section.
Do NOT restrict your search to the Experimental/Methods section alone.
Search EVERY section. Only return 'not tested' if truly absent from the entire paper.
10 PARAMETERS TO EXTRACT:
1. repeatability_rsd
WHAT: Intra-day precision expressed as %RSD.
WHERE: Results/Validation tables. Keywords: "repeatability", "intra-day precision",
"intra-assay precision", "%RSD", "CV%". Report the range or maximum value.
Example output: "RSD < 2.1% (intra-day, n=6)"
2. int_precision_rsd
WHAT: Inter-day/intermediate precision expressed as %RSD.
WHERE: Results/Validation tables. Keywords: "intermediate precision",
"inter-day precision", "inter-assay", values measured on DIFFERENT days.
Example output: "RSD < 3.5% (inter-day, 3 consecutive days)"
3. reproducibility_rsd
WHAT: Between-laboratory reproducibility %RSD.
WHERE: Keywords: "reproducibility", "between-lab", "interlaboratory".
Rarely reported - set to 'not tested' only if truly absent.
Example output: "not tested" OR "RSD < 5% (3 labs)"
4. trueness
WHAT: Accuracy as recovery % or bias from Certified Reference Material (CRM).
WHERE: Results/Validation. Keywords: "trueness", "accuracy", "CRM recovery",
"bias", "% deviation", "% error". Distinguished from spiked recovery by CRM use.
Example output: "98.5-101.2% (CRM: NIST SRM 1573a)"
5. recovery_me
WHAT: Matrix recovery % and/or Matrix Effect (ME) % from spiked samples.
WHERE: Results/Validation. Keywords: "recovery", "spiked samples",
"matrix effect", "signal suppression/enhancement", "IS-corrected recovery".
Example output: "Recovery: 89-110%, ME: -8 to +5%"
6. loq
WHAT: Limit of Quantification numerical value with unit.
WHERE: Any section. Keywords: "LOQ", "LLOQ", "limit of quantification",
"quantification limit". Usually in a Figures of Merit or Validation table.
Example output: "0.05 ug/mL" OR "0.1-2.5 ng/mL (analytes)"
7. working_range
WHAT: Validated linear/dynamic range from LOQ to upper limit.
WHERE: Any section. Keywords: "linear range", "working range",
"dynamic range", "calibration range".
Example output: "0.05-50 ug/mL"
8. linearity_r2
WHAT: Coefficient of determination R2 (or r2) for the calibration curve.
WHERE: Any section. Keywords: "R2", "r2", "correlation coefficient",
"linearity", "coefficient of determination".
Example output: "R2 > 0.998" OR "r = 0.9995"
9. robustness_factors
WHAT: Number of factors tested in a robustness/ruggedness study.
WHERE: Results/Experimental. Keywords: "robustness", "ruggedness",
"Plackett-Burman", "factorial design". Count the variables tested.
Example output: "7 factors (pH, temperature, flow rate...)"
10. selectivity_interferents
WHAT: Number of potential interferents explicitly tested for selectivity.
WHERE: Results/Validation. Keywords: "selectivity", "specificity",
"interferences", "interferents", "cross-reactivity".
Example output: "12 interferents tested" OR "not tested"
Return ONLY a valid JSON object exactly matching this schema:
{
""",
    _SUMMARY_SCHEMA,
    """
"repeatability_rsd": "extracted value with context, or 'not tested'",
"int_precision_rsd": "extracted value with context, or 'not tested'",
"reproducibility_rsd": "extracted value with context, or 'not tested'",
"trueness": "extracted value with context, or 'not tested'",
"recovery_me": "extracted value with context, or 'not tested'",
"loq": "numerical value + unit, or 'not tested'",
"working_range": "range from-to with unit, or 'not tested'",
"linearity_r2": "R2 value or range, or 'not tested'",
"robustness_factors": "number of factors, or 'not tested'",
"selectivity_interferents": "number of interferents, or 'not tested'",
"_evidence": {
"repeatability_rsd": {"quote": "verbatim sentence from paper", "reasoning": "why"},
"int_precision_rsd": {"quote": "...", "reasoning": "..."},
"loq": {"quote": "...", "reasoning": "..."}
}
}
Do NOT add any text after the closing } brace.
Document Text:
================""",
))
def _extract_root_object(text: str) -> str:
    """Return the first balanced top-level ``{...}`` span found in ``text``.

    Braces that occur inside double-quoted strings are ignored. Anything
    before the first ``{`` is dropped. If the object is never closed (e.g. a
    truncated reply) everything from the first ``{`` onwards is returned so a
    later repair step can still try to close it. Text containing no ``{`` is
    returned unchanged.
    """
    start = text.find("{")
    if start == -1:
        return text

    depth = 0
    in_str = False
    escaped = False
    for i in range(start, len(text)):
        ch = text[i]
        if escaped:
            # Character following a backslash inside a string: skip it.
            escaped = False
            continue
        if in_str:
            if ch == "\\":
                escaped = True
            elif ch == '"':
                in_str = False
            continue
        if ch == '"':
            in_str = True
        elif ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                return text[start:i + 1]
    return text[start:]


def _clean_json(raw: str) -> dict:
    """Parse an LLM reply as JSON, repairing common formatting damage.

    Handles markdown fences, chatter before/after the object, ellipsis
    placeholders, a missing comma before ``"_evidence"`` and truncated output.
    A strict parse is always attempted before any text is rewritten, so a
    reply that is already valid JSON (including one whose string values
    contain ``...``) is returned untouched.

    Args:
        raw: Raw text returned by the model.

    Returns:
        The decoded JSON value.

    Raises:
        json.JSONDecodeError: If none of the repair strategies produces
            parseable JSON.
    """
    import re

    text = raw.strip()

    # 1. Strip markdown code fences: keep the first fenced part that looks
    #    like a JSON object.
    if "```" in text:
        for part in text.split("```"):
            candidate = part.strip()
            if candidate.startswith("json"):
                candidate = candidate[4:].strip()
            if candidate.startswith("{"):
                text = candidate
                break
    text = text.strip()

    # 2. Strict parse first -- never "repair" text that is already valid.
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass

    # 3. Drop any prose before/after the root object and retry.
    root = _extract_root_object(text)
    if root != text:
        text = root
        try:
            return json.loads(text)
        except json.JSONDecodeError:
            pass

    # 4. Ellipsis placeholders that break JSON. The last pattern consumes
    #    complete string literals first, so an ellipsis *inside* a quoted
    #    value is left alone and only a bare one is quoted.
    repaired = re.sub(r',\s*\.{2,}\s*}', '}', text)      # trailing , ... }
    repaired = re.sub(r',\s*\.{2,}\s*$', '', repaired)   # trailing , ...
    repaired = re.sub(
        r'"(?:\\.|[^"\\])*"|\.{3,}',
        lambda m: m.group(0) if m.group(0).startswith('"') else '"..."',
        repaired,
    )

    # 5-7. Progressively more aggressive repairs, tried in order.
    candidates = [
        repaired,
        # Missing comma before the _evidence key.
        re.sub(r'"\s*\}\s*"_evidence"', '"}\n ,"_evidence"', repaired),
        # Give up on the _evidence block and keep the core data.
        re.sub(r',?\s*"_evidence"\s*:\s*\{[^}]*(?:\{[^}]*\}[^}]*)*\}', '', repaired),
    ]
    # Truncated output: try closing up to three open objects.
    candidates.extend(repaired + closers for closers in ("}", "}}", "}}}"))

    for candidate in candidates:
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            continue

    # Nothing worked: report a single error carrying the last repaired text.
    raise json.JSONDecodeError("Could not repair LLM JSON output", repaired, 0)
# ─── SHARED TARGET-TECHNIQUE OVERRIDE BUILDER ───────────────────────────────
def _build_target_override(target_technique: str) -> str:
    """
    Build the prompt fragment that restricts extraction to one technique.

    The fragment tells the LLM to take values only from the
    Experimental/Methods/Validation sections for ``target_technique`` and to
    ignore mentions of it in the Introduction/Literature Review.

    Returns an empty string when ``target_technique`` is empty or whitespace.
    """
    if not target_technique.strip():
        return ""

    separator = "============================================================="
    # The leading and trailing "" entries give the fragment its opening and
    # closing newline once the lines are joined.
    lines = [
        "",
        "π¨ CRITICAL FOCUS INSTRUCTION β READ BEFORE EXTRACTING:",
        separator,
        "This article describes or compares MULTIPLE analytical methods.",
        f"YOU MUST EXTRACT PARAMETERS **ONLY** FOR: '{target_technique}'",
        "STRICT RULES:",
        "β SOURCE all parameter values EXCLUSIVELY from these sections:",
        "β’ Experimental / Materials & Methods",
        "β’ Instrumentation & Apparatus",
        "β’ Sample Preparation / Procedure",
        f"β’ Method Validation / Results & Discussion for '{target_technique}' specifically",
        "β DO NOT extract values from:",
        f"β’ Introduction or Literature Review (even if '{target_technique}' is mentioned there)",
        "β’ Comparison tables that cite other authorsβ values β use ONLY in-study values",
        "β’ Any section describing a different technique, even if adjacent",
        "HANDLING MISSING DATA:",
        f"- If a parameter is not reported for '{target_technique}' in this paper, set the value",
        "to its default/null equivalent (do NOT invent or borrow values from another technique).",
        f'- In the _evidence quote field, write: "Not explicitly reported for {target_technique}"',
        "if you cannot find a verbatim quote.",
        separator,
        "",
    ]
    return "\n".join(lines)
# ─── GEMINI PROVIDER ─────────────────────────────────────────
class GeminiExtractor:
    """Extractor that sends prompts to the ``gemini-2.0-flash`` model."""

    def __init__(self, api_key: str):
        # Imported lazily so the SDK is only needed when this provider is used.
        import google.generativeai as genai

        genai.configure(api_key=api_key)
        self.model = genai.GenerativeModel("gemini-2.0-flash")

    def extract_text(self, pdf_path: str) -> str:
        """Return the text of the PDF at ``pdf_path`` via the shared reader."""
        return _read_pdf(pdf_path)

    def analyze_document(self, text: str, analysis_type: str = "agree", target_technique: str = "") -> Dict[str, Any]:
        """Run one extraction prompt over ``text`` and return the parsed JSON.

        ``analysis_type`` selects the prompt template; any value not listed
        below falls back to ``EXTRACTION_PROMPT``. Only the first 15000
        characters of ``text`` are sent to the model.
        """
        prompts_by_type = {
            "nqs": NQS_PROMPT,
            "complexmogapi": COMPLEXMOGAPI_PROMPT,
            "ecoscale": ECOSCALE_PROMPT,
            "cafri": CAFRI_PROMPT,
            "bagi": BAGI_PROMPT,
            "rapi": RAPI_PROMPT,
            "scan_techniques": SCAN_TECHNIQUES_PROMPT,
        }
        template = prompts_by_type.get(analysis_type, EXTRACTION_PROMPT)
        focus_block = _build_target_override(target_technique)
        closing = "\n================\nReturn ONLY the JSON object, no other text."

        reply = self.model.generate_content(template + focus_block + text[:15000] + closing)
        return _clean_json(reply.text)
# ─── GROQ PROVIDER (Llama 3 — FREE) ─────────────────────────
class GroqExtractor:
    """Extractor that sends prompts to ``llama-3.3-70b-versatile`` via the Groq client."""

    def __init__(self, api_key: str):
        # Imported lazily so the SDK is only needed when this provider is used.
        from groq import Groq

        self.client = Groq(api_key=api_key)

    def extract_text(self, pdf_path: str) -> str:
        """Return the text of the PDF at ``pdf_path`` via the shared reader."""
        return _read_pdf(pdf_path)

    def analyze_document(self, text: str, analysis_type: str = "agree", target_technique: str = "") -> Dict[str, Any]:
        """Run one extraction prompt over ``text`` and return the parsed JSON.

        ``analysis_type`` selects the prompt template; any value not listed
        below falls back to ``EXTRACTION_PROMPT``. Only the first 12000
        characters of ``text`` are sent to the model.
        """
        prompts_by_type = {
            "nqs": NQS_PROMPT,
            "complexmogapi": COMPLEXMOGAPI_PROMPT,
            "ecoscale": ECOSCALE_PROMPT,
            "cafri": CAFRI_PROMPT,
            "bagi": BAGI_PROMPT,
            "rapi": RAPI_PROMPT,
            "scan_techniques": SCAN_TECHNIQUES_PROMPT,
        }
        template = prompts_by_type.get(analysis_type, EXTRACTION_PROMPT)
        focus_block = _build_target_override(target_technique)
        closing = "\n================\nReturn ONLY the JSON object, no markdown, no extra text."

        conversation = [
            {"role": "system", "content": "You are a JSON extraction assistant. Always respond with only valid JSON."},
            {"role": "user", "content": template + focus_block + text[:12000] + closing},
        ]
        completion = self.client.chat.completions.create(
            model="llama-3.3-70b-versatile",
            messages=conversation,
            temperature=0.1,
            max_tokens=2500,
        )
        return _clean_json(completion.choices[0].message.content)
# ─── OPENAI PROVIDER ─────────────────────────────────────────
class OpenAIExtractor:
    """Extractor that sends prompts to ``gpt-4o-mini`` in JSON-object response mode."""

    def __init__(self, api_key: str):
        # Imported lazily so the SDK is only needed when this provider is used.
        from openai import OpenAI

        self.client = OpenAI(api_key=api_key)

    def extract_text(self, pdf_path: str) -> str:
        """Return the text of the PDF at ``pdf_path`` via the shared reader."""
        return _read_pdf(pdf_path)

    def analyze_document(self, text: str, analysis_type: str = "agree", target_technique: str = "") -> Dict[str, Any]:
        """Run one extraction prompt over ``text`` and return the parsed JSON.

        ``analysis_type`` selects the prompt template; any value not listed
        below falls back to ``EXTRACTION_PROMPT``. Only the first 14000
        characters of ``text`` are sent. The reply is decoded with a plain
        ``json.loads`` (no repair pass), since ``response_format`` requests a
        JSON object.
        """
        prompts_by_type = {
            "nqs": NQS_PROMPT,
            "complexmogapi": COMPLEXMOGAPI_PROMPT,
            "ecoscale": ECOSCALE_PROMPT,
            "cafri": CAFRI_PROMPT,
            "bagi": BAGI_PROMPT,
            "rapi": RAPI_PROMPT,
            "scan_techniques": SCAN_TECHNIQUES_PROMPT,
        }
        template = prompts_by_type.get(analysis_type, EXTRACTION_PROMPT)
        focus_block = _build_target_override(target_technique)
        closing = "\n================\nReturn ONLY the JSON object."

        conversation = [
            {"role": "system", "content": "You are a JSON extraction assistant. Always respond with only valid JSON."},
            {"role": "user", "content": template + focus_block + text[:14000] + closing},
        ]
        completion = self.client.chat.completions.create(
            model="gpt-4o-mini",
            messages=conversation,
            temperature=0.1,
            response_format={"type": "json_object"},
        )
        return json.loads(completion.choices[0].message.content)
# ─── OPENROUTER (FREE) PROVIDER ──────────────────────────────
class OpenRouterExtractor:
    """Extractor that reaches ``meta-llama/llama-3.3-70b-instruct:free`` through OpenRouter."""

    def __init__(self, api_key: str):
        # OpenRouter is driven through the OpenAI client pointed at its base URL.
        from openai import OpenAI

        self.client = OpenAI(
            base_url="https://openrouter.ai/api/v1",
            api_key=api_key,
        )

    def extract_text(self, pdf_path: str) -> str:
        """Return the text of the PDF at ``pdf_path`` via the shared reader."""
        return _read_pdf(pdf_path)

    def analyze_document(self, text: str, analysis_type: str = "agree", target_technique: str = "") -> Dict[str, Any]:
        """Run one extraction prompt over ``text`` and return the parsed JSON.

        ``analysis_type`` selects the prompt template; any value not listed
        below falls back to ``EXTRACTION_PROMPT``. Only the first 14000
        characters of ``text`` are sent to the model.
        """
        prompts_by_type = {
            "nqs": NQS_PROMPT,
            "complexmogapi": COMPLEXMOGAPI_PROMPT,
            "ecoscale": ECOSCALE_PROMPT,
            "cafri": CAFRI_PROMPT,
            "bagi": BAGI_PROMPT,
            "rapi": RAPI_PROMPT,
            "scan_techniques": SCAN_TECHNIQUES_PROMPT,
        }
        template = prompts_by_type.get(analysis_type, EXTRACTION_PROMPT)
        focus_block = _build_target_override(target_technique)
        closing = "\n================\nReturn ONLY the JSON object, no markdown."

        conversation = [
            {"role": "system", "content": "You are a JSON extraction assistant. Always respond with only valid JSON."},
            {"role": "user", "content": template + focus_block + text[:14000] + closing},
        ]
        completion = self.client.chat.completions.create(
            model="meta-llama/llama-3.3-70b-instruct:free",
            messages=conversation,
            temperature=0.1,
            max_tokens=3000,
        )
        return _clean_json(completion.choices[0].message.content)
# ─── SHARED PDF READER ────────────────────────────────────────
def _read_pdf(pdf_path: str) -> str:
    """Return the extractable text of every page of the PDF at ``pdf_path``.

    Pages are read in order with ``PyPDF2.PdfReader``. A page whose
    ``extract_text()`` result is falsy is skipped; every kept page is followed
    by a newline.
    """
    with open(pdf_path, "rb") as pdf_file:
        page_texts = (page.extract_text() for page in PyPDF2.PdfReader(pdf_file).pages)
        # Consumed inside the ``with`` so the file is still open while reading.
        return "".join(chunk + "\n" for chunk in page_texts if chunk)
# ─── LOCAL LLM PROVIDER (LM Studio / Ollama — OpenAI-compatible) ─────────────
class LocalLLMExtractor:
    """
    Connects to any local OpenAI-compatible server.
    api_key format: "<base_url>|<model_name>"
    Examples:
    LM Studio: "http://localhost:1234|lmstudio-community/Meta-Llama-3-8B-Instruct-GGUF"
    Ollama: "http://localhost:11434|llama3"
    If the separator '|' is missing, the whole string is treated as base_url
    and model defaults to 'local-model'.
    An empty base_url falls back to the provider's entry in DEFAULT_URLS and
    an empty model name falls back to DEFAULT_MODEL, with or without '|'.
    """

    DEFAULT_URLS = {
        "lmstudio": "http://localhost:1234",
        "ollama": "http://localhost:11434",
    }
    DEFAULT_MODEL = "local-model"

    @classmethod
    def _parse_endpoint(cls, api_key: str, provider_hint: str = "local") -> tuple:
        """Split ``api_key`` into ``(base_url, model_name)``.

        Both parts are whitespace-stripped. The returned URL has no trailing
        slash and always ends with ``/v1``.
        """
        url_part, _, model_part = (api_key or "").partition("|")

        base_url = url_part.strip().rstrip("/")
        if not base_url:
            # Nothing usable before '|' (or no key at all): use the provider default.
            base_url = cls.DEFAULT_URLS.get(provider_hint, cls.DEFAULT_URLS["lmstudio"])
        # Ensure the path ends with /v1 (required for OpenAI-compat)
        if not base_url.endswith("/v1"):
            base_url = base_url + "/v1"

        model_name = model_part.strip() or cls.DEFAULT_MODEL
        return base_url, model_name

    def __init__(self, api_key: str, provider_hint: str = "local"):
        # Imported lazily so the SDK is only needed when this provider is used.
        from openai import OpenAI

        base_url, self.model_name = self._parse_endpoint(api_key, provider_hint)
        self.client = OpenAI(base_url=base_url, api_key="not-needed")

    def extract_text(self, pdf_path: str) -> str:
        """Return the text of the PDF at ``pdf_path`` via the shared reader."""
        return _read_pdf(pdf_path)

    def analyze_document(self, text: str, analysis_type: str = "agree", target_technique: str = "") -> Dict[str, Any]:
        """Run one extraction prompt over ``text`` and return the parsed JSON.

        ``analysis_type`` selects the prompt template; any value not listed
        below falls back to ``EXTRACTION_PROMPT``. Only the first 12000
        characters of ``text`` are sent to the model.
        """
        prompt_map = {
            "nqs": NQS_PROMPT, "complexmogapi": COMPLEXMOGAPI_PROMPT,
            "ecoscale": ECOSCALE_PROMPT, "cafri": CAFRI_PROMPT,
            "bagi": BAGI_PROMPT, "rapi": RAPI_PROMPT,
            "scan_techniques": SCAN_TECHNIQUES_PROMPT,
        }
        base_prompt = prompt_map.get(analysis_type, EXTRACTION_PROMPT)

        # NOTE(review): the hosted providers call _build_target_override();
        # this class uses a shorter inline override. Confirm whether the
        # difference is intentional before unifying.
        target_override = ""
        if target_technique.strip():
            target_override = (
                f"\n\nπ¨ TARGET TECHNIQUE OVERRIDE:\nThe article discusses the development of more than one technique. "
                f"YOU MUST EXTRACT PARAMETERS ONLY FOR THE SPECIFIC TECHNIQUE: '{target_technique}'. "
                f"DO NOT extract parameters for other methods compared in the study.\n"
            )

        prompt = base_prompt + target_override + text[:12000] + "\n================\nReturn ONLY the JSON object, no markdown, no extra text."
        response = self.client.chat.completions.create(
            model=self.model_name,
            messages=[
                {"role": "system", "content": "You are a JSON extraction assistant. Always respond with only valid JSON."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.1,
            max_tokens=2500,
        )
        return _clean_json(response.choices[0].message.content)
# ─── FACTORY ─────────────────────────────────────────────────
def PDFExtractor(provider: str, api_key: str):
    """Factory: returns the right extractor for the chosen provider.

    ``provider`` is matched case-insensitively, ignoring surrounding
    whitespace. ``"lmstudio"``, ``"ollama"`` and ``"local"`` all yield a
    ``LocalLLMExtractor`` (the name is forwarded as ``provider_hint``);
    ``"gemini"``, ``"groq"``, ``"openai"`` and ``"openrouter"`` yield their
    hosted extractor.

    Raises:
        ValueError: If ``provider`` is not one of the names above.
    """
    local_names = ("lmstudio", "ollama", "local")
    p = provider.strip().lower()
    if p in local_names:
        return LocalLLMExtractor(api_key, provider_hint=p)

    providers = {
        "gemini": GeminiExtractor,
        "groq": GroqExtractor,
        "openai": OpenAIExtractor,
        "openrouter": OpenRouterExtractor,
    }
    cls = providers.get(p)
    if cls is None:
        # List every accepted name, including 'local'.
        choices = list(providers) + list(local_names)
        raise ValueError(f"Unknown provider '{provider}'. Choose from: {choices}")
    return cls(api_key)