File size: 4,676 Bytes
fd027e9
 
 
b927a18
fd027e9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b927a18
fd027e9
 
4bcd249
 
09a2755
 
1b379e9
4bcd249
1b379e9
 
 
 
 
09a2755
4bcd249
 
 
1b379e9
09a2755
4bcd249
 
 
 
 
 
1b379e9
4bcd249
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1b379e9
4bcd249
 
09a2755
b927a18
 
 
 
4bcd249
 
 
 
 
 
 
b927a18
4bcd249
 
d854934
4bcd249
 
 
 
 
 
 
 
 
 
 
 
 
b927a18
b121807
4bcd249
fd027e9
4bcd249
09a2755
4bcd249
 
 
fd027e9
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import base64
import json
import io
import os
from PIL import Image
from typing import Dict, Any, List, Optional
try:
    from transformers import pipeline
    HAS_TRANSFORMERS = True
except ImportError:
    HAS_TRANSFORMERS = False
    print("Warning: transformers not found. Using mock extraction.")
from models import LabReportData, Cannabinoid, Terpene

# Simulation of Hugging Face pipeline for document understanding
# In reality, this would use: pipeline("document-question-answering", model="impira/layoutlm-document-qa")
# Or a multimodal model like Donut: model="naver-clova-ix/donut-base-finetuned-docvqa"

class LabReportParser:
    """Extracts structured lab-report data (strain name, cannabinoid potency)
    from uploaded PDF/image certificates of analysis.

    Extraction strategy (best-effort at every step — a failure in one stage
    degrades the result instead of aborting):
      1. OCR the document with pytesseract (PDFs are rasterized first).
      2. Pull THC/CBD percentages out of the OCR text with regexes.
      3. Optionally ask the Hugging Face Inference API (LayoutLM doc-QA)
         for the strain name, falling back to a cleaned-up file name.
    """

    def __init__(self, use_remote_api: bool = False, hf_token: Optional[str] = None):
        # NOTE: use_remote_api is kept for interface compatibility; the remote
        # call currently fires whenever an HF token is available.
        self.use_remote_api = use_remote_api
        self.hf_token = hf_token or os.getenv("HF_TOKEN")

    async def extract_data(self, file_content: str, file_name: str) -> LabReportData:
        """Parse a base64-encoded lab report into a LabReportData.

        Args:
            file_content: Base64-encoded bytes of a PDF or image file.
            file_name: Original file name; used to detect PDFs and as the
                fallback source for the strain name.

        Returns:
            LabReportData with Total THC / Total CBD entries; values default
            to 0.0 (and confidence drops to 0.5) when OCR finds nothing.
        """
        # --- 1. Prepare images and OCR text. Best-effort: a missing OCR
        # dependency or a corrupt file must not abort parsing.
        raw_text = ""
        image_to_process_b64 = None
        pil_image = None
        try:
            import pytesseract
            image_data = base64.b64decode(file_content)
            if file_name.lower().endswith('.pdf'):
                from pdf2image import convert_from_bytes
                images = convert_from_bytes(image_data)
                if images:
                    pil_image = images[0]
                    # OCR only the first two pages: potency tables are almost
                    # always on page 1; page 2 is a cheap safety margin.
                    for img in images[:2]:
                        raw_text += pytesseract.image_to_string(img)
            else:
                pil_image = Image.open(io.BytesIO(image_data))
                raw_text = pytesseract.image_to_string(pil_image)

            if pil_image:
                # BUGFIX: JPEG cannot encode RGBA/P images — a transparent PNG
                # upload raised OSError here. Normalize to RGB first.
                if pil_image.mode != "RGB":
                    pil_image = pil_image.convert("RGB")
                buffered = io.BytesIO()
                pil_image.save(buffered, format="JPEG", quality=95)
                image_to_process_b64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
        except Exception as e:
            print(f"OCR/Image Pre-processing Error: {e}")

        # --- 2. Extract potency numbers with regexes (fast & reliable).
        thc_val = self._find_percentage(
            raw_text, r'(?:Total\s*THC|THC\s*Total|Potency)[:\s]*([\d.]+)')
        cbd_val = self._find_percentage(
            raw_text, r'(?:Total\s*CBD|CBD\s*Total)[:\s]*([\d.]+)')

        # --- 3. Ask the Inference API for the strain name (document context
        # is better there); default to the file name minus its extension.
        # BUGFIX: os.path.splitext handles any extension (.jpeg, .tiff, ...),
        # not just the three the old chained .replace() calls covered.
        strain_name = os.path.splitext(file_name)[0].title()
        if self.hf_token and image_to_process_b64:
            try:
                import requests
                API_URL = "https://api-inference.huggingface.co/models/impira/layoutlm-document-qa"
                headers = {"Authorization": f"Bearer {self.hf_token}"}
                payload = {"inputs": {"image": image_to_process_b64, "question": "What is the strain name?"}}
                # BUGFIX: without a timeout a hung remote endpoint would block
                # this request forever.
                resp = requests.post(API_URL, headers=headers, json=payload, timeout=30).json()
                if isinstance(resp, list) and len(resp) > 0:
                    answer = resp[0].get("answer", "").title()
                    # Reject junk/placeholder answers; keep file-name fallback.
                    if len(answer) > 2 and "unknown" not in answer.lower():
                        strain_name = answer
            except Exception as e:
                # Best-effort enrichment only: log and keep the fallback name.
                print(f"Inference API Error: {e}")

        # --- 4. Final data assembly.
        print(f"DEBUG: OCR Extraction - THC: {thc_val}%, Name: {strain_name}")

        return LabReportData(
            strain_name=strain_name,
            strain_type="Hybrid",  # default; OCR does not reliably yield this
            cannabinoids=[
                Cannabinoid(name="Total THC", value=thc_val, unit="%"),
                Cannabinoid(name="Total CBD", value=cbd_val, unit="%")
            ],
            terpenes=[],
            file_name=file_name,
            # THC found => OCR clearly worked; otherwise low confidence.
            confidence=0.8 if thc_val > 0 else 0.5,
            source_type="ai_hybrid"
        )

    @staticmethod
    def _find_percentage(text: str, pattern: str) -> float:
        """Return the first numeric capture of *pattern* in *text*, or 0.0.

        Catches only ValueError so a degenerate capture like '.' (which the
        permissive [\\d.]+ class can match) degrades to 0.0 instead of
        crashing, while real bugs still surface.
        """
        import re
        match = re.search(pattern, text, re.I)
        if match:
            try:
                return float(match.group(1))
            except ValueError:
                pass
        return 0.0

    def _empty_extraction(self, file_name: str) -> LabReportData:
        """Legacy zero-confidence fallback; kept for backward compatibility."""
        return LabReportData(
            strain_name=file_name.split(".")[0].title(),
            cannabinoids=[Cannabinoid(name="Total THC", value=0.0)],
            confidence=0.0,
            file_name=file_name,
            source_type="error"
        )

    async def normalize(self, data: Dict[str, Any]) -> LabReportData:
        """Validate an already-extracted dict into a LabReportData model."""
        return LabReportData(**data)