Spaces:
Running
Running
Update app/services/extraction.py
Browse files- app/services/extraction.py +107 -61
app/services/extraction.py
CHANGED
|
@@ -346,10 +346,11 @@ Output: {{
|
|
| 346 |
---
|
| 347 |
Now process this input:
|
| 348 |
Input: "{text}"
|
| 349 |
-
Output:
|
| 350 |
"""}
|
| 351 |
]
|
| 352 |
|
|
|
|
| 353 |
prompt = ""
|
| 354 |
for message in messages:
|
| 355 |
if message["role"] == "system":
|
|
@@ -362,66 +363,24 @@ Output: ONLY provide the JSON data with no additional formatting, markdowns, or
|
|
| 362 |
|
| 363 |
# Generate response
|
| 364 |
response = model.generate_content(prompt,
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
|
| 370 |
|
| 371 |
-
# Process the response
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
|
| 375 |
-
|
| 376 |
-
|
| 377 |
-
|
| 378 |
-
|
| 379 |
-
|
| 380 |
-
elif hasattr(response, 'candidates') and response.candidates:
|
| 381 |
-
parts = response.candidates[0].content.parts
|
| 382 |
-
output_text = ''.join([part.text for part in parts if hasattr(part, 'text')])
|
| 383 |
-
else:
|
| 384 |
-
# Last resort: try to find any text in the response
|
| 385 |
-
output_text = str(response)
|
| 386 |
-
if not '{' in output_text: # Check if it looks like JSON
|
| 387 |
-
# If we can't extract valid text, return a simple fallback
|
| 388 |
-
return {
|
| 389 |
-
"category": "",
|
| 390 |
-
"calories": "",
|
| 391 |
-
"time": "",
|
| 392 |
-
"ingredients": ["ingredient from " + text],
|
| 393 |
-
"keywords": ["keyword from " + text],
|
| 394 |
-
"keywords_name": []
|
| 395 |
-
}
|
| 396 |
-
except Exception as e:
|
| 397 |
-
print(f"Error processing Gemini response: {e}")
|
| 398 |
-
# Log additional information about the response for debugging
|
| 399 |
-
print(f"Response type: {type(response)}")
|
| 400 |
-
print(f"Response attributes: {dir(response)}")
|
| 401 |
-
return {
|
| 402 |
-
"error": f"Failed to process Gemini response: {str(e)}",
|
| 403 |
-
"ingredients": [text], # Include the search text as an ingredient for fallback
|
| 404 |
-
"keywords": [text],
|
| 405 |
-
"keywords_name": []
|
| 406 |
-
}
|
| 407 |
|
| 408 |
try:
|
| 409 |
-
# Extract JSON from markdown code blocks if needed
|
| 410 |
-
if output_text.strip().startswith('```') and '```' in output_text:
|
| 411 |
-
# Extract the content between the code block markers
|
| 412 |
-
import re
|
| 413 |
-
match = re.search(r'```(?:json)?\n(.*?)\n```', output_text, re.DOTALL)
|
| 414 |
-
if match:
|
| 415 |
-
output_text = match.group(1).strip()
|
| 416 |
-
else:
|
| 417 |
-
# Try another pattern without the language specification
|
| 418 |
-
match = re.search(r'```(.*?)```', output_text, re.DOTALL)
|
| 419 |
-
if match:
|
| 420 |
-
output_text = match.group(1).strip()
|
| 421 |
-
|
| 422 |
-
# Now parse the cleaned JSON
|
| 423 |
result = json.loads(output_text)
|
| 424 |
-
|
| 425 |
# Update category with closest match from dataset
|
| 426 |
original_category = result["category"]
|
| 427 |
matched_category = find_closest_category(original_category)
|
|
@@ -438,11 +397,98 @@ Output: ONLY provide the JSON data with no additional formatting, markdowns, or
|
|
| 438 |
result["keywords_name"] = result.get("keywords_name", []) + ["beverages", "caffeinated", "coffee"]
|
| 439 |
result["ingredients"] = result.get("ingredients", []) + ["coffee beans", "water"]
|
| 440 |
|
| 441 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 442 |
|
| 443 |
-
except json.JSONDecodeError
|
| 444 |
-
print(f"JSON parse error: {e}")
|
| 445 |
-
print(f"Failed to parse: {output_text}")
|
| 446 |
result = {"error": "Failed to parse JSON", "output": output_text}
|
| 447 |
|
| 448 |
return result
|
|
|
|
| 346 |
---
|
| 347 |
Now process this input:
|
| 348 |
Input: "{text}"
|
| 349 |
+
Output:
|
| 350 |
"""}
|
| 351 |
]
|
| 352 |
|
| 353 |
+
# Send the prompt to the Gemini API
|
| 354 |
prompt = ""
|
| 355 |
for message in messages:
|
| 356 |
if message["role"] == "system":
|
|
|
|
| 363 |
|
| 364 |
# Generate response
|
| 365 |
response = model.generate_content(prompt,
|
| 366 |
+
generation_config=genai.types.GenerationConfig(
|
| 367 |
+
temperature=0,
|
| 368 |
+
max_output_tokens=150,
|
| 369 |
+
top_p=1
|
| 370 |
+
))
|
| 371 |
|
| 372 |
+
# Process the response
|
| 373 |
+
output_text = response.text.strip()
|
| 374 |
+
|
| 375 |
+
# Check if output is wrapped in markdown code blocks and extract the JSON if needed
|
| 376 |
+
if output_text.strip().startswith('```') and '```' in output_text:
|
| 377 |
+
import re
|
| 378 |
+
match = re.search(r'```(?:json)?\n(.*?)\n```', output_text, re.DOTALL)
|
| 379 |
+
if match:
|
| 380 |
+
output_text = match.group(1).strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 381 |
|
| 382 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 383 |
result = json.loads(output_text)
|
|
|
|
| 384 |
# Update category with closest match from dataset
|
| 385 |
original_category = result["category"]
|
| 386 |
matched_category = find_closest_category(original_category)
|
|
|
|
| 397 |
result["keywords_name"] = result.get("keywords_name", []) + ["beverages", "caffeinated", "coffee"]
|
| 398 |
result["ingredients"] = result.get("ingredients", []) + ["coffee beans", "water"]
|
| 399 |
|
| 400 |
+
elif "smoothie bowl" in text.lower():
|
| 401 |
+
result["keywords"] = result.get("keywords", []) + ["beverages", "healthy", "smoothie bowl"]
|
| 402 |
+
result["keywords_name"] = result.get("keywords_name", []) + ["beverages", "smoothie bowl"]
|
| 403 |
+
result["ingredients"] = result.get("ingredients", []) + ["fruits", "yogurt", "granola"]
|
| 404 |
+
|
| 405 |
+
elif "kombucha" in text.lower():
|
| 406 |
+
result["keywords"] = result.get("keywords", []) + ["beverage", "fermented", "kombucha"]
|
| 407 |
+
result["keywords_name"] = result.get("keywords_name", []) + ["beverages", "kombucha"]
|
| 408 |
+
result["ingredients"] = result.get("ingredients", []) + ["tea", "sugar", "SCOBY"]
|
| 409 |
+
|
| 410 |
+
elif "herbal tea" in text.lower():
|
| 411 |
+
result["keywords"] = result.get("keywords", []) + ["beverages", "caffeine-free", "herbal tea"]
|
| 412 |
+
result["keywords_name"] = result.get("keywords_name", []) + ["beverages", "herbal tea"]
|
| 413 |
+
result["ingredients"] = result.get("ingredients", []) + ["herbs", "water"]
|
| 414 |
+
|
| 415 |
+
elif "seaweed" in text.lower():
|
| 416 |
+
result["keywords"] = result.get("keywords", []) + ["ingredient", "seafood", "seaweed"]
|
| 417 |
+
result["keywords_name"] = result.get("keywords_name", []) + ["seaweed"]
|
| 418 |
+
result["ingredients"] = result.get("ingredients", []) + ["seaweed"]
|
| 419 |
+
|
| 420 |
+
elif "vegan cheese" in text.lower():
|
| 421 |
+
result["keywords"] = result.get("keywords", []) + ["dairy-free", "vegan", "cheese"]
|
| 422 |
+
result["keywords_name"] = result.get("keywords_name", []) + ["vegan cheese"]
|
| 423 |
+
result["ingredients"] = result.get("ingredients", []) + ["cashews", "nutritional yeast", "coconut oil"]
|
| 424 |
+
|
| 425 |
+
elif "air fryer" in text.lower():
|
| 426 |
+
result["keywords"] = result.get("keywords", []) + ["cooking method", "air fryer", "healthy"]
|
| 427 |
+
result["keywords_name"] = result.get("keywords_name", []) + ["air fryer"]
|
| 428 |
+
result["ingredients"] = result.get("ingredients", []) # Ingredients vary with recipe, left blank
|
| 429 |
+
|
| 430 |
+
elif "instant pot" in text.lower():
|
| 431 |
+
result["keywords"] = result.get("keywords", []) + ["cooking method", "instant pot", "pressure cooker"]
|
| 432 |
+
result["keywords_name"] = result.get("keywords_name", []) + ["instant pot"]
|
| 433 |
+
result["ingredients"] = result.get("ingredients", []) # Ingredients vary with recipe, left blank
|
| 434 |
+
|
| 435 |
+
elif "sous vide" in text.lower():
|
| 436 |
+
result["keywords"] = result.get("keywords", []) + ["cooking method", "sous vide", "precision cooking"]
|
| 437 |
+
result["keywords_name"] = result.get("keywords_name", []) + ["sous vide"]
|
| 438 |
+
result["ingredients"] = result.get("ingredients", []) # Ingredients vary with recipe, left blank
|
| 439 |
+
|
| 440 |
+
elif "paleo" in text.lower():
|
| 441 |
+
result["keywords"] = result.get("keywords", []) + ["diet", "paleo", "low-carb"]
|
| 442 |
+
result["keywords_name"] = result.get("keywords_name", []) + ["paleo"]
|
| 443 |
+
result["ingredients"] = result.get("ingredients", []) # Ingredients vary with recipe, left blank
|
| 444 |
+
|
| 445 |
+
elif "fodmap" in text.lower():
|
| 446 |
+
result["keywords"] = result.get("keywords", []) + ["diet", "fodmap", "digestive health"]
|
| 447 |
+
result["keywords_name"] = result.get("keywords_name", []) + ["fodmap"]
|
| 448 |
+
result["ingredients"] = result.get("ingredients", []) # Ingredients vary with recipe, left blank
|
| 449 |
+
|
| 450 |
+
elif "cold brew" in text.lower():
|
| 451 |
+
result["keywords"] = result.get("keywords", []) + ["beverages", "caffeinated", "cold coffee"]
|
| 452 |
+
result["keywords_name"] = result.get("keywords_name", []) + ["beverages", "cold brew"]
|
| 453 |
+
result["ingredients"] = result.get("ingredients", []) + ["coffee grounds", "water"]
|
| 454 |
+
|
| 455 |
+
elif "matcha" in text.lower():
|
| 456 |
+
result["keywords"] = result.get("keywords", []) + ["beverages", "green tea", "matcha"]
|
| 457 |
+
result["keywords_name"] = result.get("keywords_name", []) + ["beverages", "matcha"]
|
| 458 |
+
result["ingredients"] = result.get("ingredients", []) + ["matcha powder", "water", "milk"]
|
| 459 |
+
|
| 460 |
+
elif "smoothie" in text.lower():
|
| 461 |
+
result["keywords"] = result.get("keywords", []) + ["beverages", "healthy", "smoothie"]
|
| 462 |
+
result["keywords_name"] = result.get("keywords_name", []) + ["beverages", "smoothie"]
|
| 463 |
+
result["ingredients"] = result.get("ingredients", []) + ["fruits", "milk", "yogurt"]
|
| 464 |
+
|
| 465 |
+
elif "protein shake" in text.lower():
|
| 466 |
+
result["keywords"] = result.get("keywords", []) + ["beverages", "high protein", "shake"]
|
| 467 |
+
result["keywords_name"] = result.get("keywords_name", []) + ["beverages", "protein shake"]
|
| 468 |
+
result["ingredients"] = result.get("ingredients", []) + ["protein powder", "milk", "banana"]
|
| 469 |
+
|
| 470 |
+
elif "oat milk" in text.lower() or "almond milk" in text.lower():
|
| 471 |
+
result["keywords"] = result.get("keywords", []) + ["dairy-free", "vegan", "plant-based milk"]
|
| 472 |
+
result["keywords_name"] = result.get("keywords_name", []) + ["oat milk" if "oat" in text.lower() else "almond milk"]
|
| 473 |
+
result["ingredients"] = result.get("ingredients", []) + ["oats" if "oat" in text.lower() else "almonds", "water"]
|
| 474 |
+
|
| 475 |
+
elif "zoodles" in text.lower():
|
| 476 |
+
result["keywords"] = result.get("keywords", []) + ["low carb", "gluten-free", "vegetable noodles", "noodles"]
|
| 477 |
+
result["keywords_name"] = result.get("keywords_name", []) + ["zoodles", "noodles"]
|
| 478 |
+
result["ingredients"] = result.get("ingredients", []) + ["zucchini"]
|
| 479 |
+
|
| 480 |
+
elif "avocado toast" in text.lower():
|
| 481 |
+
result["keywords"] = result.get("keywords", []) + ["breakfast", "healthy", "avocado"]
|
| 482 |
+
result["keywords_name"] = result.get("keywords_name", []) + ["avocado toast"]
|
| 483 |
+
result["ingredients"] = result.get("ingredients", []) + ["avocado", "bread"]
|
| 484 |
+
|
| 485 |
+
elif "golden milk" in text.lower():
|
| 486 |
+
result["keywords"] = result.get("keywords", []) + ["beverage", "turmeric", "anti-inflammatory"]
|
| 487 |
+
result["keywords_name"] = result.get("keywords_name", []) + ["golden milk"]
|
| 488 |
+
result["ingredients"] = result.get("ingredients", []) + ["turmeric", "milk", "honey", "spices"]
|
| 489 |
+
# other cases...
|
| 490 |
|
| 491 |
+
except json.JSONDecodeError:
|
|
|
|
|
|
|
| 492 |
result = {"error": "Failed to parse JSON", "output": output_text}
|
| 493 |
|
| 494 |
return result
|