Aramente committed on
Commit
bc01feb
·
1 Parent(s): 7cd7958

fix: robust JSON extraction from Gemini 2.5 Flash — find JSON in response, strip thinking, fix trailing commas

Browse files
app/routers/linkedin.py CHANGED
@@ -48,7 +48,7 @@ async def debug_parse_pdf(file: UploadFile = File(...)):
48
 
49
  import google.generativeai as genai
50
  genai.configure(api_key=api_key)
51
- model = genai.GenerativeModel("gemini-2.0-flash-lite")
52
 
53
  prompt = f"""Extract structured profile data from this LinkedIn PDF export.
54
 
@@ -66,7 +66,12 @@ Return valid JSON with: name, title, email, phone, linkedin, location, summary,
66
  response_mime_type="application/json",
67
  ),
68
  )
69
- data = json.loads(r.text)
 
 
 
 
 
70
  return {
71
  "ok": True,
72
  "name": data.get("name"),
 
48
 
49
  import google.generativeai as genai
50
  genai.configure(api_key=api_key)
51
+ model = genai.GenerativeModel("gemini-2.5-flash")
52
 
53
  prompt = f"""Extract structured profile data from this LinkedIn PDF export.
54
 
 
66
  response_mime_type="application/json",
67
  ),
68
  )
69
+ import re as re_mod
70
+ raw_resp = r.text.strip()
71
+ start = raw_resp.find("{")
72
+ end = raw_resp.rfind("}") + 1
73
+ json_str = re_mod.sub(r",\s*([}\]])", r"\1", raw_resp[start:end])
74
+ data = json.loads(json_str)
75
  return {
76
  "ok": True,
77
  "name": data.get("name"),
app/services/pdf_parser.py CHANGED
@@ -31,7 +31,7 @@ def parse_linkedin_pdf(pdf_bytes: bytes) -> Profile:
31
  return _fallback_parse(raw_text)
32
 
33
  genai.configure(api_key=api_key)
34
- model = genai.GenerativeModel("gemini-2.0-flash-lite") # No thinking — reliable JSON, faster
35
 
36
  prompt = f"""Extract structured profile data from this LinkedIn PDF export. The text is messy because LinkedIn PDFs use a two-column layout — sections are interleaved. Use your judgment to reconstruct the correct structure.
37
 
@@ -82,10 +82,20 @@ IMPORTANT:
82
  generation_config=genai.types.GenerationConfig(
83
  max_output_tokens=4000,
84
  temperature=0.1,
85
- response_mime_type="application/json", # Forces valid JSON output
86
  ),
87
  )
88
- data = json.loads(response.text)
 
 
 
 
 
 
 
 
 
 
 
89
 
90
  def s(val: str | None) -> str:
91
  """Safe string — convert None/null to empty string."""
 
31
  return _fallback_parse(raw_text)
32
 
33
  genai.configure(api_key=api_key)
34
+ model = genai.GenerativeModel("gemini-2.5-flash")
35
 
36
  prompt = f"""Extract structured profile data from this LinkedIn PDF export. The text is messy because LinkedIn PDFs use a two-column layout — sections are interleaved. Use your judgment to reconstruct the correct structure.
37
 
 
82
  generation_config=genai.types.GenerationConfig(
83
  max_output_tokens=4000,
84
  temperature=0.1,
 
85
  ),
86
  )
87
+ # Extract JSON from response — may have markdown blocks or thinking preamble
88
+ raw_resp = response.text.strip()
89
+ # Find the JSON object in the response
90
+ start = raw_resp.find("{")
91
+ end = raw_resp.rfind("}") + 1
92
+ if start == -1 or end == 0:
93
+ raise ValueError("No JSON found in response")
94
+ json_str = raw_resp[start:end]
95
+ # Fix common issues
96
+ import re
97
+ json_str = re.sub(r",\s*([}\]])", r"\1", json_str) # trailing commas
98
+ data = json.loads(json_str)
99
 
100
  def s(val: str | None) -> str:
101
  """Safe string — convert None/null to empty string."""