Spaces:
Running
Running
fix: robust JSON extraction from Gemini 2.5 Flash — find JSON in response, strip thinking, fix trailing commas
Browse files
- app/routers/linkedin.py +7 -2
- app/services/pdf_parser.py +13 -3
app/routers/linkedin.py
CHANGED
|
@@ -48,7 +48,7 @@ async def debug_parse_pdf(file: UploadFile = File(...)):
|
|
| 48 |
|
| 49 |
import google.generativeai as genai
|
| 50 |
genai.configure(api_key=api_key)
|
| 51 |
-
model = genai.GenerativeModel("gemini-2.
|
| 52 |
|
| 53 |
prompt = f"""Extract structured profile data from this LinkedIn PDF export.
|
| 54 |
|
|
@@ -66,7 +66,12 @@ Return valid JSON with: name, title, email, phone, linkedin, location, summary,
|
|
| 66 |
response_mime_type="application/json",
|
| 67 |
),
|
| 68 |
)
|
| 69 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
return {
|
| 71 |
"ok": True,
|
| 72 |
"name": data.get("name"),
|
|
|
|
| 48 |
|
| 49 |
import google.generativeai as genai
|
| 50 |
genai.configure(api_key=api_key)
|
| 51 |
+
model = genai.GenerativeModel("gemini-2.5-flash")
|
| 52 |
|
| 53 |
prompt = f"""Extract structured profile data from this LinkedIn PDF export.
|
| 54 |
|
|
|
|
| 66 |
response_mime_type="application/json",
|
| 67 |
),
|
| 68 |
)
|
| 69 |
+
import re as re_mod
|
| 70 |
+
raw_resp = r.text.strip()
|
| 71 |
+
start = raw_resp.find("{")
|
| 72 |
+
end = raw_resp.rfind("}") + 1
|
| 73 |
+
json_str = re_mod.sub(r",\s*([}\]])", r"\1", raw_resp[start:end])
|
| 74 |
+
data = json.loads(json_str)
|
| 75 |
return {
|
| 76 |
"ok": True,
|
| 77 |
"name": data.get("name"),
|
app/services/pdf_parser.py
CHANGED
|
@@ -31,7 +31,7 @@ def parse_linkedin_pdf(pdf_bytes: bytes) -> Profile:
|
|
| 31 |
return _fallback_parse(raw_text)
|
| 32 |
|
| 33 |
genai.configure(api_key=api_key)
|
| 34 |
-
model = genai.GenerativeModel("gemini-2.
|
| 35 |
|
| 36 |
prompt = f"""Extract structured profile data from this LinkedIn PDF export. The text is messy because LinkedIn PDFs use a two-column layout — sections are interleaved. Use your judgment to reconstruct the correct structure.
|
| 37 |
|
|
@@ -82,10 +82,20 @@ IMPORTANT:
|
|
| 82 |
generation_config=genai.types.GenerationConfig(
|
| 83 |
max_output_tokens=4000,
|
| 84 |
temperature=0.1,
|
| 85 |
-
response_mime_type="application/json", # Forces valid JSON output
|
| 86 |
),
|
| 87 |
)
|
| 88 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
|
| 90 |
def s(val: str | None) -> str:
|
| 91 |
"""Safe string — convert None/null to empty string."""
|
|
|
|
| 31 |
return _fallback_parse(raw_text)
|
| 32 |
|
| 33 |
genai.configure(api_key=api_key)
|
| 34 |
+
model = genai.GenerativeModel("gemini-2.5-flash")
|
| 35 |
|
| 36 |
prompt = f"""Extract structured profile data from this LinkedIn PDF export. The text is messy because LinkedIn PDFs use a two-column layout — sections are interleaved. Use your judgment to reconstruct the correct structure.
|
| 37 |
|
|
|
|
| 82 |
generation_config=genai.types.GenerationConfig(
|
| 83 |
max_output_tokens=4000,
|
| 84 |
temperature=0.1,
|
|
|
|
| 85 |
),
|
| 86 |
)
|
| 87 |
+
# Extract JSON from response — may have markdown blocks or thinking preamble
|
| 88 |
+
raw_resp = response.text.strip()
|
| 89 |
+
# Find the JSON object in the response
|
| 90 |
+
start = raw_resp.find("{")
|
| 91 |
+
end = raw_resp.rfind("}") + 1
|
| 92 |
+
if start == -1 or end == 0:
|
| 93 |
+
raise ValueError("No JSON found in response")
|
| 94 |
+
json_str = raw_resp[start:end]
|
| 95 |
+
# Fix common issues
|
| 96 |
+
import re
|
| 97 |
+
json_str = re.sub(r",\s*([}\]])", r"\1", json_str) # trailing commas
|
| 98 |
+
data = json.loads(json_str)
|
| 99 |
|
| 100 |
def s(val: str | None) -> str:
|
| 101 |
"""Safe string — convert None/null to empty string."""
|