Victor Gerardo Rivera committed on
Commit
a63e7fe
·
1 Parent(s): 1b379e9

Refine AI extraction questions and logging

Browse files
Files changed (1) hide show
  1. services/parser.py +37 -34
services/parser.py CHANGED
@@ -22,7 +22,7 @@ class LabReportParser:
22
  self.hf_token = hf_token or os.getenv("HF_TOKEN")
23
 
24
  async def extract_data(self, file_content: str, file_name: str) -> LabReportData:
25
- # 1. Ensure we have an image even if it's a PDF
26
  image_to_process = None
27
  try:
28
  image_data = base64.b64decode(file_content)
@@ -30,59 +30,62 @@ class LabReportParser:
30
  from pdf2image import convert_from_bytes
31
  images = convert_from_bytes(image_data)
32
  if images:
33
- # Use the first page for extraction
34
  buffered = io.BytesIO()
35
- images[0].save(buffered, format="JPEG")
36
  image_to_process = base64.b64encode(buffered.getvalue()).decode("utf-8")
37
  else:
38
- image_to_process = file_content # Already an image
39
  except Exception as e:
40
- print(f"Error preparing file for extraction: {e}")
41
 
42
- # 2. Use the Inference API for real extraction
43
  if self.hf_token and image_to_process:
44
  try:
45
  import requests
46
  API_URL = "https://api-inference.huggingface.co/models/impira/layoutlm-document-qa"
47
  headers = {"Authorization": f"Bearer {self.hf_token}"}
48
 
49
- def query(question):
50
- payload = {
51
- "inputs": {
52
- "image": image_to_process,
53
- "question": question
54
- }
55
- }
56
- response = requests.post(API_URL, headers=headers, json=payload)
57
- return response.json()
58
-
59
- # Attempt real extractions
60
- strain_resp = query("What is the strain name?")
61
- thc_resp = query("What is the Total THC percentage?")
62
- cbd_resp = query("What is the Total CBD percentage?")
63
 
64
- # If we get credible answers, we build a real object
65
- if isinstance(strain_resp, list) and len(strain_resp) > 0:
66
- real_strain = strain_resp[0].get("answer", "Blue Dream")
67
- real_thc = 0.0
68
- if isinstance(thc_resp, list) and len(thc_resp) > 0:
 
 
 
 
 
69
  try:
70
- # Clean "22.5%" -> 22.5
71
- thc_str = thc_resp[0].get("answer", "0").replace("%", "").strip()
72
- real_thc = float(thc_str)
73
  except: pass
74
 
 
75
  return LabReportData(
76
- strain_name=real_strain,
77
- cannabinoids=[Cannabinoid(name="Total THC", value=real_thc)],
 
 
 
 
 
78
  file_name=file_name,
79
- confidence=strain_resp[0].get("score", 0.0),
80
- source_type="api"
81
  )
82
  except Exception as e:
83
- print(f"Inference API failed, falling back to mock: {e}")
84
 
85
- # Fallback to mock
86
  return self._mock_extraction(file_name)
87
 
88
  def _mock_extraction(self, file_name: str) -> LabReportData:
 
22
  self.hf_token = hf_token or os.getenv("HF_TOKEN")
23
 
24
  async def extract_data(self, file_content: str, file_name: str) -> LabReportData:
25
+ # 1. Ensure we have an image
26
  image_to_process = None
27
  try:
28
  image_data = base64.b64decode(file_content)
 
30
  from pdf2image import convert_from_bytes
31
  images = convert_from_bytes(image_data)
32
  if images:
 
33
  buffered = io.BytesIO()
34
+ images[0].save(buffered, format="JPEG", quality=90)
35
  image_to_process = base64.b64encode(buffered.getvalue()).decode("utf-8")
36
  else:
37
+ image_to_process = file_content
38
  except Exception as e:
39
+ print(f"Error preparing file: {e}")
40
 
41
+ # 2. Inference API
42
  if self.hf_token and image_to_process:
43
  try:
44
  import requests
45
  API_URL = "https://api-inference.huggingface.co/models/impira/layoutlm-document-qa"
46
  headers = {"Authorization": f"Bearer {self.hf_token}"}
47
 
48
+ results = {}
49
+ questions = {
50
+ "strain": "What is the strain name or sample name?",
51
+ "thc": "What is the total THC percentage?",
52
+ "cbd": "What is the total CBD percentage?",
53
+ "lab": "What is the name of the lab?",
54
+ "date": "What is the test date?"
55
+ }
 
 
 
 
 
 
56
 
57
+ for key, q in questions.items():
58
+ payload = {"inputs": {"image": image_to_process, "question": q}}
59
+ resp = requests.post(API_URL, headers=headers, json=payload).json()
60
+ if isinstance(resp, list) and len(resp) > 0:
61
+ results[key] = resp[0]
62
+
63
+ if "strain" in results:
64
+ strain_name = results["strain"].get("answer", "Unknown Strain")
65
+ thc_val = 0.0
66
+ if "thc" in results:
67
  try:
68
+ # Extract number like "22.5" from "22.5%"
69
+ val_str = "".join(c for c in results["thc"]["answer"] if c.isdigit() or c == ".")
70
+ thc_val = float(val_str)
71
  except: pass
72
 
73
+ print(f"AI Success: {strain_name} - THC: {thc_val}%")
74
  return LabReportData(
75
+ strain_name=strain_name,
76
+ lab_name=results.get("lab", {}).get("answer"),
77
+ test_date=results.get("date", {}).get("answer"),
78
+ cannabinoids=[
79
+ Cannabinoid(name="Total THC", value=thc_val),
80
+ Cannabinoid(name="Total CBD", value=0.0) # simplify for now
81
+ ],
82
  file_name=file_name,
83
+ confidence=results["strain"].get("score", 0.0),
84
+ source_type="ai_real"
85
  )
86
  except Exception as e:
87
+ print(f"AI Extraction failed: {e}")
88
 
 
89
  return self._mock_extraction(file_name)
90
 
91
  def _mock_extraction(self, file_name: str) -> LabReportData: