Spaces:
Running
🕷️ Intelligent Web Scraping for Nutrition Recommendations
Browse files✨ Revolutionary Features:
• Real-time web scraping from trusted sources (CDC, NIH, Mayo Clinic)
• Intelligent content extraction and summarization
• Query-specific URL selection based on user intent
• Cached responses for performance (1-hour cache)
• Contextual source recommendations with credibility scores
🧠 Smart Intelligence:
• 'I want to lose weight' → Scrapes CDC weight loss guidelines + NIH resources
• 'Heart health' → Scrapes AHA + DASH diet information
• 'Diabetes' → Scrapes CDC diabetes nutrition + ADA resources
• Auto-summarizes scraped content based on user query
🔧 Technical Improvements:
• Added BeautifulSoup4, lxml, html5lib for robust web scraping
• Concurrent scraping of multiple URLs with aiohttp
• Content extraction from headings, paragraphs, and lists
• Intelligent summarization based on query relevance
• Fallback to static responses if scraping fails
🎯 Result: Users get current, evidence-based nutrition information directly from trusted health organizations, not static responses!
- app.py +318 -85
- requirements.txt +4 -1
|
@@ -16,6 +16,10 @@ import urllib.request
|
|
| 16 |
import requests
|
| 17 |
import asyncio
|
| 18 |
import aiohttp
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
|
| 20 |
# Initialize FastAPI app
|
| 21 |
app = FastAPI(
|
|
@@ -656,8 +660,8 @@ async def get_nutrition_info(request: NutritionRequest):
|
|
| 656 |
try:
|
| 657 |
query = request.query.lower().strip()
|
| 658 |
|
| 659 |
-
# Generate nutrition response
|
| 660 |
-
nutrition_info =
|
| 661 |
|
| 662 |
return NutritionResponse(
|
| 663 |
status="success",
|
|
@@ -707,94 +711,323 @@ async def record_user_feedback(request: UserFeedbackRequest):
|
|
| 707 |
error=str(e)
|
| 708 |
)
|
| 709 |
|
| 710 |
-
|
| 711 |
-
|
| 712 |
-
|
| 713 |
-
|
| 714 |
-
|
| 715 |
-
# Define trusted sources
|
| 716 |
-
trusted_sources = [
|
| 717 |
-
{
|
| 718 |
-
"title": "Nutrition.gov - Official Nutrition Information",
|
| 719 |
-
"url": "https://www.nutrition.gov/",
|
| 720 |
-
"domain": "nutrition.gov",
|
| 721 |
-
"credibility_score": 0.95
|
| 722 |
-
},
|
| 723 |
-
{
|
| 724 |
-
"title": "NIH Office of Dietary Supplements",
|
| 725 |
-
"url": "https://ods.od.nih.gov/",
|
| 726 |
-
"domain": "nih.gov",
|
| 727 |
-
"credibility_score": 0.98
|
| 728 |
-
},
|
| 729 |
-
{
|
| 730 |
-
"title": "CDC Nutrition Guidelines",
|
| 731 |
-
"url": "https://www.cdc.gov/nutrition/",
|
| 732 |
-
"domain": "cdc.gov",
|
| 733 |
-
"credibility_score": 0.95
|
| 734 |
}
|
| 735 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 736 |
|
| 737 |
-
# Generate topic-specific responses
|
| 738 |
-
if any(word in query for word in ["vitamin", "mineral"]):
|
| 739 |
-
return {
|
| 740 |
-
"topic": "Vitamins and Minerals",
|
| 741 |
-
"summary": "Vitamins and minerals are essential micronutrients that support various body functions including immune health, energy production, and disease prevention.",
|
| 742 |
-
"key_points": [
|
| 743 |
-
"Get nutrients from whole foods when possible",
|
| 744 |
-
"Fat-soluble vitamins (A,D,E,K) are stored in body fat",
|
| 745 |
-
"Water-soluble vitamins (B,C) need regular replenishment",
|
| 746 |
-
"Consult healthcare providers before taking supplements"
|
| 747 |
-
],
|
| 748 |
-
"sources": trusted_sources
|
| 749 |
-
}
|
| 750 |
-
elif any(word in query for word in ["heart", "cardiovascular"]):
|
| 751 |
-
return {
|
| 752 |
-
"topic": "Heart-Healthy Nutrition",
|
| 753 |
-
"summary": "A heart-healthy diet emphasizes fruits, vegetables, whole grains, lean proteins, and healthy fats while limiting saturated fat, trans fat, and sodium.",
|
| 754 |
-
"key_points": [
|
| 755 |
-
"Limit saturated fat to <10% of daily calories",
|
| 756 |
-
"Choose omega-3 rich foods like fish and walnuts",
|
| 757 |
-
"Eat 5-9 servings of fruits and vegetables daily",
|
| 758 |
-
"Limit sodium to 2,300mg per day (1,500mg if at risk)"
|
| 759 |
-
],
|
| 760 |
-
"sources": trusted_sources
|
| 761 |
-
}
|
| 762 |
-
elif any(word in query for word in ["diabetes", "blood sugar"]):
|
| 763 |
-
return {
|
| 764 |
-
"topic": "Diabetes Nutrition Management",
|
| 765 |
-
"summary": "Managing diabetes involves choosing foods that help maintain stable blood sugar levels through balanced meals with appropriate carbohydrates, protein, and healthy fats.",
|
| 766 |
-
"key_points": [
|
| 767 |
-
"Monitor carbohydrate intake and choose complex carbs",
|
| 768 |
-
"Include protein and healthy fats with meals",
|
| 769 |
-
"Eat at consistent times to help manage blood sugar",
|
| 770 |
-
"Stay hydrated and limit sugary beverages"
|
| 771 |
-
],
|
| 772 |
-
"sources": trusted_sources
|
| 773 |
-
}
|
| 774 |
-
elif any(word in query for word in ["weight", "lose", "management"]):
|
| 775 |
return {
|
| 776 |
-
"
|
| 777 |
-
"
|
| 778 |
-
"
|
| 779 |
-
|
| 780 |
-
|
| 781 |
-
|
| 782 |
-
|
| 783 |
-
],
|
| 784 |
-
"sources": trusted_sources
|
| 785 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 786 |
else:
|
| 787 |
-
|
| 788 |
-
"
|
| 789 |
-
"
|
| 790 |
-
"
|
| 791 |
-
|
| 792 |
-
|
| 793 |
-
|
| 794 |
-
|
| 795 |
-
|
| 796 |
-
|
| 797 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 798 |
|
| 799 |
# Load model on startup
|
| 800 |
@app.on_event("startup")
|
|
|
|
| 16 |
import requests
|
| 17 |
import asyncio
|
| 18 |
import aiohttp
|
| 19 |
+
from bs4 import BeautifulSoup
|
| 20 |
+
import re
|
| 21 |
+
from urllib.parse import urljoin, urlparse
|
| 22 |
+
import time
|
| 23 |
|
| 24 |
# Initialize FastAPI app
|
| 25 |
app = FastAPI(
|
|
|
|
| 660 |
try:
|
| 661 |
query = request.query.lower().strip()
|
| 662 |
|
| 663 |
+
# Generate nutrition response using intelligent web scraping
|
| 664 |
+
nutrition_info = await generate_intelligent_nutrition_response(query)
|
| 665 |
|
| 666 |
return NutritionResponse(
|
| 667 |
status="success",
|
|
|
|
| 711 |
error=str(e)
|
| 712 |
)
|
| 713 |
|
| 714 |
+
# Web scraping and content extraction
|
| 715 |
+
class WebScraper:
|
| 716 |
+
def __init__(self):
|
| 717 |
+
self.headers = {
|
| 718 |
+
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 719 |
}
|
| 720 |
+
self.cache = {} # Simple in-memory cache
|
| 721 |
+
self.cache_duration = 3600 # 1 hour cache
|
| 722 |
+
|
| 723 |
+
async def scrape_url(self, url: str) -> dict:
|
| 724 |
+
"""Scrape content from a single URL"""
|
| 725 |
+
try:
|
| 726 |
+
# Check cache first
|
| 727 |
+
cache_key = url
|
| 728 |
+
if cache_key in self.cache:
|
| 729 |
+
cached_data, timestamp = self.cache[cache_key]
|
| 730 |
+
if time.time() - timestamp < self.cache_duration:
|
| 731 |
+
return cached_data
|
| 732 |
+
|
| 733 |
+
async with aiohttp.ClientSession() as session:
|
| 734 |
+
async with session.get(url, headers=self.headers, timeout=10) as response:
|
| 735 |
+
if response.status == 200:
|
| 736 |
+
html = await response.text()
|
| 737 |
+
soup = BeautifulSoup(html, 'html.parser')
|
| 738 |
+
|
| 739 |
+
# Extract meaningful content
|
| 740 |
+
content = self.extract_content(soup, url)
|
| 741 |
+
|
| 742 |
+
# Cache the result
|
| 743 |
+
self.cache[cache_key] = (content, time.time())
|
| 744 |
+
|
| 745 |
+
return content
|
| 746 |
+
else:
|
| 747 |
+
return {"error": f"HTTP {response.status}"}
|
| 748 |
+
|
| 749 |
+
except Exception as e:
|
| 750 |
+
return {"error": str(e)}
|
| 751 |
+
|
| 752 |
+
def extract_content(self, soup: BeautifulSoup, url: str) -> dict:
|
| 753 |
+
"""Extract meaningful content from BeautifulSoup object"""
|
| 754 |
+
|
| 755 |
+
# Remove script and style elements
|
| 756 |
+
for script in soup(["script", "style", "nav", "footer", "header"]):
|
| 757 |
+
script.decompose()
|
| 758 |
+
|
| 759 |
+
# Try to find the main content area
|
| 760 |
+
main_content = soup.find('main') or soup.find('article') or soup.find('div', class_=re.compile(r'content|main|article'))
|
| 761 |
+
|
| 762 |
+
if not main_content:
|
| 763 |
+
main_content = soup.find('body')
|
| 764 |
+
|
| 765 |
+
# Extract title
|
| 766 |
+
title = ""
|
| 767 |
+
title_tag = soup.find('title')
|
| 768 |
+
if title_tag:
|
| 769 |
+
title = title_tag.get_text().strip()
|
| 770 |
+
|
| 771 |
+
# Extract headings and paragraphs
|
| 772 |
+
headings = []
|
| 773 |
+
paragraphs = []
|
| 774 |
+
|
| 775 |
+
if main_content:
|
| 776 |
+
# Get headings (h1, h2, h3)
|
| 777 |
+
for heading in main_content.find_all(['h1', 'h2', 'h3']):
|
| 778 |
+
heading_text = heading.get_text().strip()
|
| 779 |
+
if heading_text and len(heading_text) < 200:
|
| 780 |
+
headings.append(heading_text)
|
| 781 |
+
|
| 782 |
+
# Get paragraphs
|
| 783 |
+
for p in main_content.find_all('p'):
|
| 784 |
+
p_text = p.get_text().strip()
|
| 785 |
+
if p_text and len(p_text) > 50: # Filter out short paragraphs
|
| 786 |
+
paragraphs.append(p_text)
|
| 787 |
+
|
| 788 |
+
# Extract lists (ul, ol)
|
| 789 |
+
lists = []
|
| 790 |
+
if main_content:
|
| 791 |
+
for ul in main_content.find_all(['ul', 'ol']):
|
| 792 |
+
list_items = []
|
| 793 |
+
for li in ul.find_all('li'):
|
| 794 |
+
li_text = li.get_text().strip()
|
| 795 |
+
if li_text and len(li_text) < 300:
|
| 796 |
+
list_items.append(li_text)
|
| 797 |
+
if list_items:
|
| 798 |
+
lists.append(list_items)
|
| 799 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 800 |
return {
|
| 801 |
+
"title": title,
|
| 802 |
+
"url": url,
|
| 803 |
+
"domain": urlparse(url).netloc,
|
| 804 |
+
"headings": headings[:10], # Limit to first 10 headings
|
| 805 |
+
"paragraphs": paragraphs[:15], # Limit to first 15 paragraphs
|
| 806 |
+
"lists": lists[:5], # Limit to first 5 lists
|
| 807 |
+
"scraped_at": time.time()
|
|
|
|
|
|
|
| 808 |
}
|
| 809 |
+
|
| 810 |
+
async def scrape_multiple_urls(self, urls: list) -> list:
|
| 811 |
+
"""Scrape multiple URLs concurrently"""
|
| 812 |
+
tasks = [self.scrape_url(url) for url in urls]
|
| 813 |
+
results = await asyncio.gather(*tasks, return_exceptions=True)
|
| 814 |
+
|
| 815 |
+
# Filter out exceptions and errors
|
| 816 |
+
valid_results = []
|
| 817 |
+
for result in results:
|
| 818 |
+
if isinstance(result, dict) and "error" not in result:
|
| 819 |
+
valid_results.append(result)
|
| 820 |
+
|
| 821 |
+
return valid_results
|
| 822 |
+
|
| 823 |
+
# Initialize scraper
|
| 824 |
+
web_scraper = WebScraper()
|
| 825 |
+
|
| 826 |
+
def get_trusted_urls_for_query(query: str) -> list:
|
| 827 |
+
"""Get relevant trusted URLs based on the query"""
|
| 828 |
+
query_lower = query.lower()
|
| 829 |
+
|
| 830 |
+
urls = []
|
| 831 |
+
|
| 832 |
+
# Weight loss / management
|
| 833 |
+
if any(phrase in query_lower for phrase in ["lose weight", "weight loss", "weight management"]):
|
| 834 |
+
urls.extend([
|
| 835 |
+
"https://www.cdc.gov/healthyweight/losing_weight/index.html",
|
| 836 |
+
"https://www.niddk.nih.gov/health-information/weight-management/choosing-a-safe-successful-weight-loss-program",
|
| 837 |
+
"https://www.mayoclinic.org/healthy-lifestyle/weight-loss/basics/weightloss-basics/hlv-20049483"
|
| 838 |
+
])
|
| 839 |
+
|
| 840 |
+
# Heart health
|
| 841 |
+
elif any(phrase in query_lower for phrase in ["heart", "cardiovascular", "cholesterol"]):
|
| 842 |
+
urls.extend([
|
| 843 |
+
"https://www.heart.org/en/healthy-living/healthy-eating/eat-smart/nutrition-basics",
|
| 844 |
+
"https://www.nhlbi.nih.gov/education/dash-eating-plan",
|
| 845 |
+
"https://www.mayoclinic.org/diseases-conditions/heart-disease/in-depth/heart-healthy-diet/art-20047702"
|
| 846 |
+
])
|
| 847 |
+
|
| 848 |
+
# Diabetes
|
| 849 |
+
elif any(phrase in query_lower for phrase in ["diabetes", "blood sugar"]):
|
| 850 |
+
urls.extend([
|
| 851 |
+
"https://www.cdc.gov/diabetes/managing/eat-well.html",
|
| 852 |
+
"https://www.niddk.nih.gov/health-information/diabetes/overview/diet-eating-physical-activity",
|
| 853 |
+
"https://diabetes.org/food-nutrition"
|
| 854 |
+
])
|
| 855 |
+
|
| 856 |
+
# Vitamins and supplements
|
| 857 |
+
elif any(word in query_lower for word in ["vitamin", "supplement", "mineral"]):
|
| 858 |
+
urls.extend([
|
| 859 |
+
"https://ods.od.nih.gov/factsheets/list-all/",
|
| 860 |
+
"https://www.nutrition.gov/topics/dietary-supplements",
|
| 861 |
+
"https://www.mayoclinic.org/healthy-lifestyle/nutrition-and-healthy-eating/in-depth/supplements/art-20044894"
|
| 862 |
+
])
|
| 863 |
+
|
| 864 |
+
# General nutrition
|
| 865 |
else:
|
| 866 |
+
urls.extend([
|
| 867 |
+
"https://www.nutrition.gov/topics/basic-nutrition",
|
| 868 |
+
"https://www.cdc.gov/nutrition/guidelines.html",
|
| 869 |
+
"https://www.choosemyplate.gov/"
|
| 870 |
+
])
|
| 871 |
+
|
| 872 |
+
return urls[:3] # Limit to 3 URLs to avoid overwhelming the system
|
| 873 |
+
|
| 874 |
+
async def generate_intelligent_nutrition_response(query: str) -> dict:
|
| 875 |
+
"""Generate nutrition response by scraping and summarizing trusted sources"""
|
| 876 |
+
|
| 877 |
+
# Get relevant URLs
|
| 878 |
+
trusted_urls = get_trusted_urls_for_query(query)
|
| 879 |
+
|
| 880 |
+
# Scrape the URLs
|
| 881 |
+
scraped_data = await web_scraper.scrape_multiple_urls(trusted_urls)
|
| 882 |
+
|
| 883 |
+
if not scraped_data:
|
| 884 |
+
# Fallback to static response if scraping fails
|
| 885 |
+
return generate_static_nutrition_response(query)
|
| 886 |
+
|
| 887 |
+
# Combine and summarize the scraped content
|
| 888 |
+
combined_content = ""
|
| 889 |
+
sources = []
|
| 890 |
+
|
| 891 |
+
for data in scraped_data:
|
| 892 |
+
# Add to sources
|
| 893 |
+
sources.append({
|
| 894 |
+
"title": data["title"],
|
| 895 |
+
"url": data["url"],
|
| 896 |
+
"domain": data["domain"],
|
| 897 |
+
"credibility_score": get_credibility_score(data["domain"])
|
| 898 |
+
})
|
| 899 |
+
|
| 900 |
+
# Combine content for summarization
|
| 901 |
+
content_parts = []
|
| 902 |
+
content_parts.extend(data["headings"])
|
| 903 |
+
content_parts.extend(data["paragraphs"][:5]) # First 5 paragraphs
|
| 904 |
+
|
| 905 |
+
# Add list items
|
| 906 |
+
for list_items in data["lists"]:
|
| 907 |
+
content_parts.extend(list_items[:3]) # First 3 items from each list
|
| 908 |
+
|
| 909 |
+
combined_content += " ".join(content_parts) + " "
|
| 910 |
+
|
| 911 |
+
# Generate summary using the scraped content
|
| 912 |
+
summary, key_points = summarize_nutrition_content(combined_content, query)
|
| 913 |
+
|
| 914 |
+
# Determine topic from query
|
| 915 |
+
topic = determine_nutrition_topic(query)
|
| 916 |
+
|
| 917 |
+
return {
|
| 918 |
+
"topic": topic,
|
| 919 |
+
"summary": summary,
|
| 920 |
+
"key_points": key_points,
|
| 921 |
+
"sources": sources,
|
| 922 |
+
"scraped_from": len(scraped_data),
|
| 923 |
+
"query_analyzed": query
|
| 924 |
+
}
|
| 925 |
+
|
| 926 |
+
def get_credibility_score(domain: str) -> float:
|
| 927 |
+
"""Get credibility score for a domain"""
|
| 928 |
+
scores = {
|
| 929 |
+
"cdc.gov": 0.95,
|
| 930 |
+
"nih.gov": 0.98,
|
| 931 |
+
"niddk.nih.gov": 0.98,
|
| 932 |
+
"nutrition.gov": 0.95,
|
| 933 |
+
"mayoclinic.org": 0.90,
|
| 934 |
+
"heart.org": 0.92,
|
| 935 |
+
"diabetes.org": 0.93,
|
| 936 |
+
"choosemyplate.gov": 0.90,
|
| 937 |
+
"nhlbi.nih.gov": 0.95,
|
| 938 |
+
"ods.od.nih.gov": 0.98
|
| 939 |
+
}
|
| 940 |
+
return scores.get(domain, 0.75)
|
| 941 |
+
|
| 942 |
+
def summarize_nutrition_content(content: str, query: str) -> tuple:
|
| 943 |
+
"""Summarize nutrition content and extract key points"""
|
| 944 |
+
|
| 945 |
+
# Clean the content
|
| 946 |
+
content = re.sub(r'\s+', ' ', content) # Remove extra whitespace
|
| 947 |
+
content = content[:3000] # Limit content length
|
| 948 |
+
|
| 949 |
+
# Use simple summarization for now (could use LLM later)
|
| 950 |
+
sentences = content.split('.')
|
| 951 |
+
|
| 952 |
+
# Find most relevant sentences based on query keywords
|
| 953 |
+
query_words = query.lower().split()
|
| 954 |
+
relevant_sentences = []
|
| 955 |
+
|
| 956 |
+
for sentence in sentences:
|
| 957 |
+
sentence = sentence.strip()
|
| 958 |
+
if len(sentence) > 20:
|
| 959 |
+
# Score sentence based on query word matches
|
| 960 |
+
score = sum(1 for word in query_words if word in sentence.lower())
|
| 961 |
+
if score > 0:
|
| 962 |
+
relevant_sentences.append((score, sentence))
|
| 963 |
+
|
| 964 |
+
# Sort by relevance and take top sentences
|
| 965 |
+
relevant_sentences.sort(key=lambda x: x[0], reverse=True)
|
| 966 |
+
|
| 967 |
+
# Create summary from top 3 relevant sentences
|
| 968 |
+
summary_sentences = [sent[1] for sent in relevant_sentences[:3]]
|
| 969 |
+
summary = ". ".join(summary_sentences)
|
| 970 |
+
|
| 971 |
+
if not summary:
|
| 972 |
+
summary = "Evidence-based nutrition information from trusted health organizations."
|
| 973 |
+
|
| 974 |
+
# Extract key points (look for list-like content)
|
| 975 |
+
key_points = []
|
| 976 |
+
for sentence in sentences:
|
| 977 |
+
sentence = sentence.strip()
|
| 978 |
+
if any(starter in sentence.lower() for starter in ["eat ", "choose ", "limit ", "include ", "avoid ", "consume "]):
|
| 979 |
+
if len(sentence) > 20 and len(sentence) < 150:
|
| 980 |
+
key_points.append(sentence.capitalize())
|
| 981 |
+
|
| 982 |
+
# Ensure we have at least 4 key points
|
| 983 |
+
if len(key_points) < 4:
|
| 984 |
+
key_points.extend([
|
| 985 |
+
"Eat a variety of nutrient-dense foods from all food groups",
|
| 986 |
+
"Practice portion control and mindful eating",
|
| 987 |
+
"Stay hydrated with water as your primary beverage",
|
| 988 |
+
"Consult healthcare professionals for personalized advice"
|
| 989 |
+
])
|
| 990 |
+
|
| 991 |
+
return summary[:500], key_points[:6] # Limit summary and key points
|
| 992 |
+
|
| 993 |
+
def determine_nutrition_topic(query: str) -> str:
|
| 994 |
+
"""Determine the main nutrition topic from the query"""
|
| 995 |
+
query_lower = query.lower()
|
| 996 |
+
|
| 997 |
+
if any(phrase in query_lower for phrase in ["lose weight", "weight loss"]):
|
| 998 |
+
return "Weight Loss Nutrition"
|
| 999 |
+
elif any(phrase in query_lower for phrase in ["gain weight", "build muscle"]):
|
| 1000 |
+
return "Healthy Weight Gain"
|
| 1001 |
+
elif any(phrase in query_lower for phrase in ["heart", "cardiovascular"]):
|
| 1002 |
+
return "Heart-Healthy Nutrition"
|
| 1003 |
+
elif any(phrase in query_lower for phrase in ["diabetes", "blood sugar"]):
|
| 1004 |
+
return "Diabetes Nutrition Management"
|
| 1005 |
+
elif any(word in query_lower for word in ["vitamin", "supplement"]):
|
| 1006 |
+
return "Vitamins and Supplements"
|
| 1007 |
+
else:
|
| 1008 |
+
return "General Nutrition Guidelines"
|
| 1009 |
+
|
| 1010 |
+
def generate_static_nutrition_response(query: str) -> dict:
|
| 1011 |
+
"""Fallback static response when scraping fails"""
|
| 1012 |
+
# Your existing static response logic here
|
| 1013 |
+
return {
|
| 1014 |
+
"topic": "General Nutrition",
|
| 1015 |
+
"summary": "Unable to fetch current information. Please try again later.",
|
| 1016 |
+
"key_points": ["Consult healthcare professionals for nutrition advice"],
|
| 1017 |
+
"sources": []
|
| 1018 |
+
}
|
| 1019 |
+
|
| 1020 |
+
def generate_nutrition_response(query: str) -> dict:
|
| 1021 |
+
"""
|
| 1022 |
+
Legacy sync wrapper for async function
|
| 1023 |
+
"""
|
| 1024 |
+
import asyncio
|
| 1025 |
+
loop = asyncio.new_event_loop()
|
| 1026 |
+
asyncio.set_event_loop(loop)
|
| 1027 |
+
try:
|
| 1028 |
+
return loop.run_until_complete(generate_intelligent_nutrition_response(query))
|
| 1029 |
+
finally:
|
| 1030 |
+
loop.close()
|
| 1031 |
|
| 1032 |
# Load model on startup
|
| 1033 |
@app.on_event("startup")
|
|
@@ -11,4 +11,7 @@ scikit-learn>=1.3.0
|
|
| 11 |
numpy>=1.24.0
|
| 12 |
datasets>=2.19.0
|
| 13 |
aiohttp>=3.8.0
|
| 14 |
-
requests>=2.25.0
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
numpy>=1.24.0
|
| 12 |
datasets>=2.19.0
|
| 13 |
aiohttp>=3.8.0
|
| 14 |
+
requests>=2.25.0
|
| 15 |
+
beautifulsoup4>=4.12.0
|
| 16 |
+
lxml>=4.9.0
|
| 17 |
+
html5lib>=1.1
|