Spaces:

XMMR12
/

Medical-Plants-Data-Extractor

Runtime error

App Files Files Community

XMMR12 committed on Jun 7, 2025

Commit

8cca16d

verified ·

1 Parent(s): c10c57f

Update app.py

Browse files

Files changed (1) hide show

app.py +154 -339

app.py CHANGED Viewed

@@ -1,339 +1,154 @@
-import os
-import gradio as gr
-from duckduckgo_search import DDGS
-import sqlite3
-import json
-import requests
-from typing import List, Dict, Optional
-import time
-from bs4 import BeautifulSoup
-specialtoken=os.getenv("SPECIALTOKEN")
-#plants = ["Echinacea", "Ginkgo biloba", "Turmeric"]
-#Unused
-def GET_full_plant_information(plant_name:str):
-    """ """
-    query = f"{plant_name} plant medicinal uses scientific information site:.edu OR site:.gov OR site:.org"
-    search_results = DDGS().text(keywords=query, max_results=3)
-    content=""
-    for result in search_results:
-        content+=fetch_page_content(result['href']);
-    PROMPT_TEMPLATE = f"""
-Extract plant information from the following content in JSON format with these keys:
-["Name", "Scientific Name", "Alternate Names", "Description", "Plant Family",
-"Origin", "Growth Habitat", "Active Components", "Treatable Conditions",
-"Preparation Methods", "Dosage", "Duration", "Contraindications", "Side Effects",
-"Interactions", "Part Used", "Harvesting Time", "Storage Tips", "Images",
-"Related Videos", "Sources"]
-Plant: {plant_name}
-Content:
-{content}
-Output ONLY valid JSON with the specified keys. Use empty strings for missing information."""
-    response = requests.get(f"{specialtoken}/{PROMPT_TEMPLATE}")
-    return response.json()
-#end Unused
-# Database setup
-DB_NAME = "plants.db"
-def init_db():
-    conn = sqlite3.connect(DB_NAME)
-    cursor = conn.cursor()
-    cursor.execute('''
-        CREATE TABLE IF NOT EXISTS plants (
-            id INTEGER PRIMARY KEY AUTOINCREMENT,
-            name TEXT NOT NULL,
-            scientific_name TEXT,
-            alternate_names TEXT,
-            description TEXT,
-            plant_family TEXT,
-            origin TEXT,
-            growth_habitat TEXT,
-            active_components TEXT,
-            treatable_conditions TEXT,
-            preparation_methods TEXT,
-            dosage TEXT,
-            duration TEXT,
-            contraindications TEXT,
-            side_effects TEXT,
-            interactions TEXT,
-            part_used TEXT,
-            harvesting_time TEXT,
-            storage_tips TEXT,
-            images TEXT,
-            related_videos TEXT,
-            sources TEXT
-        )
-    ''')
-    conn.commit()
-    conn.close()
-def process_with_ai(raw_data: str, plant_name: str) -> Dict:
-    """Process raw plant data through AI to extract structured information"""
-    PROMPT_TEMPLATE = """You are a botanist assistant that extracts and structures information about medicinal plants."""
-    PROMPT = f"""Extract plant information from the following content (if available) in JSON format with these keys:
-["Name", "Scientific Name", "Alternate Names", "Description", "Plant Family",
-"Origin", "Growth Habitat", "Active Components", "Treatable Conditions",
-"Preparation Methods", "Dosage", "Duration", "Contraindications", "Side Effects",
-"Interactions", "Part Used", "Harvesting Time", "Storage Tips", "Images",
-"Related Videos", "Sources"]
-Plant: {plant_name}
-Content:
-{raw_data}
-Output ONLY valid JSON with the specified keys. Use empty strings for missing information."""
-    payload = {
-        "model": "openai",
-        "messages": [
-            {"role": "system", "content": PROMPT_TEMPLATE},
-            #{"role": "user", "content": f"Process this information about {plant_name}:\n{raw_data}"}
-            {"role": "user", "content": PROMPT}
-        ],
-        #"response_format": { "type": "json_object" }
-    }
-    try:
-        resp = requests.post(
-            specialtoken,
-            json=payload,
-            headers={"Content-Type": "application/json"},
-            timeout=30
-        )
-        resp.raise_for_status()
-        # Improved response handling
-        response_json = resp.json()
-        if "choices" not in response_json or not response_json["choices"]:
-            raise ValueError("Invalid API response format")
-        ai_response = response_json["choices"]["message"]["content"]
-        print(ai_response)
-        try:
-            return json.loads(ai_response)
-        except json.JSONDecodeError:
-            # If the response isn't valid JSON, create a minimal response
-            return {
-                "Name": plant_name,
-                "Description": raw_data,
-                "Sources": "Search results",
-                "Error": "AI response format invalid"
-            }
-    except Exception as e:
-        print(f"AI processing error for {plant_name}: {str(e)}")
-        return {
-            "Name": plant_name,
-            "Description": raw_data,
-            "Sources": "Search results",
-            "Error": f"Processing failed: {str(e)}"
-        }
-def fetch_plant_data(plant_name: str) -> Optional[Dict]:
-    """Fetch and process plant data
-    Args:
-        plant_name: The plant name to search on.
-    Returns:
-        The top 3 results with title, link and body, or an error message if no results or API issues."""
-    try:
-        # 1. Get raw data from internet
-        query = f"{plant_name} plant medicinal uses scientific information site:.edu OR site:.gov OR site:.org"
-        search_results = DDGS().text(keywords=query, max_results=3)
-        if not search_results:
-            return {
-                "Name": plant_name,
-                "Error": "No search results found"
-            }
-        #raw_data = "\n".join([f"Source: {r['href']}\nContent: {r['body']}" for r in search_results])
-        raw_data=""
-        for result in search_results:
-            raw_data+=fetch_page_content(result['href']);
-        # 2. Process with AI
-        processed_data = process_with_ai(raw_data, plant_name)
-        if not processed_data:
-            return {
-                "Name": plant_name,
-                "Error": "AI processing returned empty response"
-            }
-        # 3. Add metadata
-        processed_data["Name"] = plant_name
-        if "Sources" not in processed_data:
-            processed_data["Sources"] = "\n".join([r["href"] for r in search_results])
-        return processed_data
-    except Exception as e:
-        print(f"Error processing {plant_name}: {e}")
-        return {
-            "Name": plant_name,
-            "Error": f"Processing failed: {str(e)}"
-        }
-"""
-Searches DuckDuckGo and returns the first result.
-Args:
-    query: The search query.
-Returns:
-    The first result's title and link, or an error message if no results or API issues.
-"""
-#TODO arrange the logic together with tools
-def fetch_page_content(url: str):
-    """Get webpage content with error handling"""
-    try:
-        response = requests.get(url, timeout=10)
-        soup = BeautifulSoup(response.text, 'html.parser')
-        # Remove unwanted elements
-        for element in soup(['script', 'style', 'header', 'footer', 'nav']):
-            element.decompose()
-        text = soup.get_text(separator='\n', strip=True)
-        for each in ["Page not available","403 Forbidden"]:
-            if each in text:
-                return ""
-        return text[:5000]  # Limit to 5k characters
-    except Exception as e:
-        return f"Error fetching page: {str(e)}"
-def save_to_db(plant_data: Dict) -> bool:
-    """Save processed plant data to database"""
-    try:
-        conn = sqlite3.connect(DB_NAME)
-        cursor = conn.cursor()
-        # Convert arrays to strings if they exist
-        for field in ["Alternate Names", "Active Components", "Treatable Conditions",
-                     "Preparation Methods", "Contraindications", "Side Effects",
-                     "Interactions"]:
-            if field in plant_data:
-                if isinstance(plant_data[field], list):
-                    plant_data[field] = ", ".join(plant_data[field])
-                elif not isinstance(plant_data[field], str):
-                    plant_data[field] = str(plant_data[field])
-        columns = []
-        values = []
-        for key, value in plant_data.items():
-            if key.lower() == "error":  # Skip error field
-                continue
-            columns.append(key.lower().replace(" ", "_"))
-            values.append(str(value) if value else None)
-        columns_str = ", ".join(columns)
-        placeholders = ", ".join(["?"] * len(columns))
-        cursor.execute(
-            f"INSERT INTO plants ({columns_str}) VALUES ({placeholders})",
-            values
-        )
-        conn.commit()
-        conn.close()
-        return True
-    except Exception as e:
-        print(f"Database save error: {e}")
-        return False
-def process_plants(plants_array: List[str]) -> str:
-    """Main processing pipeline"""
-    results = []
-    for plant in plants_array:
-        plant = plant.strip()
-        if not plant:
-            continue
-        print(f"Processing {plant}...")
-        plant_data = fetch_plant_data(plant)
-        if plant_data:
-            save_success = save_to_db(plant_data)
-            plant_data["Database_Save_Success"] = save_success
-            results.append(plant_data)
-        time.sleep(2)  # Rate limiting
-    return json.dumps(results, indent=2) if results else json.dumps({"message": "No valid results found"})
-#For View:
-def get_all_plants() -> List[Dict]:
-    """Retrieve all plants from database"""
-    try:
-        conn = sqlite3.connect(DB_NAME)
-        conn.row_factory = sqlite3.Row
-        cursor = conn.cursor()
-        cursor.execute("SELECT `_rowid_`,* FROM plants ORDER BY `_rowid_` DESC")
-        plants = [dict(row) for row in cursor.fetchall()]
-        conn.close()
-        return plants
-    except Exception as e:
-        print(f"Database retrieval error: {e}")
-        return [{"Error": "Failed to retrieve data from database"}]
-# Initialize database
-init_db()
-# Gradio Interface
-with gr.Blocks(title="AI-Powered Medicinal Plants Database") as app:
-    gr.Markdown("# 🌿 AI-Powered Medicinal Plants Database")
-    with gr.Tab("Fetch & Process Plants"):
-        gr.Markdown("### Enter plant names (comma separated)")
-        with gr.Row():
-            plant_input = gr.Textbox(label="Plant Names",
-                                   placeholder="e.g., Neem, Peppermint, Aloe Vera")
-            fetch_btn = gr.Button("Process Plants", variant="primary")
-        json_output = gr.JSON(label="AI-Processed Results")
-        fetch_btn.click(
-            fn=lambda x: process_plants([p.strip() for p in x.split(",")]),
-            inputs=plant_input,
-            outputs=json_output
-        )
-    with gr.Tab("View Database"):
-        gr.Markdown("### Stored Plant Information")
-        with gr.Row():
-            refresh_btn = gr.Button("Refresh Data", variant="secondary")
-            clear_db = gr.Button("Clear Database", variant="stop")
-        db_table = gr.Dataframe(
-            headers=["id", "name", "scientific_name", "description"],
-            datatype=["number", "str", "str", "str"],
-            col_count=(4, "fixed"),
-            interactive=True
-        )
-        refresh_btn.click(
-            fn=get_all_plants,
-            outputs=db_table
-        )
-        def clear_database():
-            conn = sqlite3.connect(DB_NAME)
-            conn.execute("DELETE FROM plants")
-            conn.commit()
-            conn.close()
-            return []
-        clear_db.click(
-            fn=clear_database,
-            outputs=db_table
-        )
-if __name__ == "__main__":
-    app.launch(debug=True, share=False)

+import os
+import gradio as gr
+from duckduckgo_search import DDGS
+import sqlite3
+import json
+import requests
+from typing import List, Dict, Optional
+import time
+from bs4 import BeautifulSoup
+specialtoken=os.getenv("SPECIALTOKEN")  # URL prefix read from the SPECIALTOKEN environment variable (None when unset); prompts are appended to it and fetched with requests.get
+#plants=['Turmeric', 'Aloe Vera', 'Neem', 'Tulsi', 'Ashwagandha', 'Ginger', 'Basil', 'Peppermint', 'Lavender', 'Eucalyptus', 'Chamomile', 'Sandalwood', 'Giloy', 'Haritaki', 'Brahmi', 'Gotu Kola', 'Holy Basil', 'Fenugreek', 'Licorice', 'Fennel', 'Cinnamon', 'Clove', 'Black Pepper', 'Cardamom', 'Neem', 'Indian Gooseberry', 'Saffron', 'Thyme', 'Valerian', 'Marigold', 'Ginseng', 'Dandelion', 'Hibiscus', 'Milk Thistle', 'Magnolia', "St. John's Wort", 'Yarrow', 'Calendula', 'Coriander', 'Senna', 'Echinacea', 'Moringa', 'Plantain', 'Amla', 'Shatavari', 'Peppermint', 'Chamomile', 'Gotu Kola', 'Ashoka', 'Arnica', 'Burdock Root', "Cat's Claw", "Devil's Claw", 'Elderberry', 'Feverfew', 'Ginkgo Biloba', 'Goldenseal', 'Hawthorn', 'Kava', 'Lemon Balm', 'Marshmallow Root', 'Nettle', 'Olive Leaf', 'Passionflower', 'Red Clover', 'Reishi Mushroom', 'Rhodiola', 'Sage', 'Saw Palmetto', 'Slippery Elm', 'Stinging Nettle', 'Witch Hazel', 'Yellow Dock', 'Ashitaba', 'Bael', 'Bacopa', 'Cumin', 'Guduchi', 'Jamun', 'Jatamansi', 'Karela', 'Gudmar', 'Schisandra', 'Baikal Skullcap', 'Mullein', 'Chrysanthemum', 'Catuaba', 'Dong Quai', 'Jiaogulan', 'Muira Puama', 'Catnip', 'Olive']
+plants = ["Echinacea", "Ginkgo biloba", "Turmeric"]  # plant names handed to process_plants by the interface code
+PROMPT_TEMPLATE = """
+Extract plant information from the following content in JSON format with these keys:
+["Name", "Scientific Name", "Alternate Names", "Description", "Plant Family",
+"Origin", "Growth Habitat", "Active Components", "Treatable Conditions",
+"Preparation Methods", "Dosage", "Duration", "Contraindications", "Side Effects",
+"Interactions", "Part Used", "Harvesting Time", "Storage Tips", "Images",
+"Related Videos", "Sources"]
+Plant: {plant_name}
+Content:
+{content}
+Output ONLY valid JSON with the specified keys. Use empty strings for missing information.
+    """  # filled with str.format(plant_name=..., content=...) before being sent; lower-cased, underscore-joined keys become database column names in save_to_db
+def fetch_page_content(url: str):
+    """Get webpage content with error handling"""
+    try:
+        response = requests.get(url, timeout=10)
+        soup = BeautifulSoup(response.text, 'html.parser')
+        # Remove unwanted elements
+        for element in soup(['script', 'style', 'header', 'footer', 'nav']):
+            element.decompose()
+        text = soup.get_text(separator='\n', strip=True)
+        for each in ["Page not available","403 Forbidden"]:
+            if each in text:
+                return ""
+        return text[:5000]  # Limit to 5k characters
+    except Exception as e:
+        return f"Error fetching page: {str(e)}"
+def search_full_plant_information(plant_name:str):
+    """ """
+    query = f"{plant_name} plant medicinal uses scientific information site:.edu OR site:.gov OR site:.org"
+    counter=0
+    while True:
+        counter+=1
+        print(counter)
+        try:
+            search_results = DDGS().text(keywords=query, max_results=3)
+            break
+        except Exception as e :
+            time.sleep(2)
+            pass
+    content=""
+    for result in search_results:
+        content+=fetch_page_content(result['href']);
+    prompt = PROMPT_TEMPLATE.format(plant_name=plant_name, content=content)
+    response = requests.get(f"{specialtoken}/{prompt}")
+    print (response)
+    return response.json()
+DB_NAME="plants.db"  # SQLite database file opened by save_to_db
+def save_to_db(plant_data: Dict) -> bool:
+    """Save processed plant data to database"""
+    try:
+        conn = sqlite3.connect(DB_NAME)
+        cursor = conn.cursor()
+        # Convert arrays to strings if they exist
+        for field in ["Alternate Names", "Active Components", "Treatable Conditions",
+                     "Preparation Methods", "Contraindications", "Side Effects",
+                     "Interactions"]:
+            if field in plant_data:
+                if isinstance(plant_data[field], list):
+                    plant_data[field] = ", ".join(plant_data[field])
+                elif not isinstance(plant_data[field], str):
+                    plant_data[field] = str(plant_data[field])
+        columns = []
+        values = []
+        for key, value in plant_data.items():
+            if key.lower() == "error":  # Skip error field
+                continue
+            columns.append(key.lower().replace(" ", "_"))
+            values.append(str(value) if value else None)
+        columns_str = ", ".join(columns)
+        placeholders = ", ".join(["?"] * len(columns))
+        cursor.execute(
+            f"INSERT INTO plants ({columns_str}) VALUES ({placeholders})",
+            values
+        )
+        conn.commit()
+        conn.close()
+        return True
+    except Exception as e:
+        print(f"Database save error: {e}")
+        return False
+def process_plants(plants_array: List[str]) -> str:
+    """Main processing pipeline"""
+    results = []
+    for plant in plants_array:
+        plant = plant.strip()
+        if not plant:
+            continue
+        print(f"Processing {plant}...")
+        plant_data = search_full_plant_information(plant)
+        if plant_data:
+            save_success = save_to_db(plant_data)
+            plant_data["Database_Save_Success"] = save_success
+            results.append(plant_data)
+        time.sleep(2)  # Rate limiting
+    return json.dumps(results, indent=2) if results else json.dumps({"message": "No valid results found"})
+#use it here :
+#process_plants(plants)
+#or use interface:
+with gr.Blocks(title="AI-Powered Medicinal Plants Database") as app:
+    gr.Markdown("# 🌿 AI-Powered Medicinal Plants Database")
+    with gr.Tab("Fetch & Process Plants"):
+        gr.Markdown("### Enter plant names (comma separated)")
+        with gr.Row():
+            plant_input = gr.Textbox(label="Plant Names",
+                                   placeholder="e.g., Neem, Peppermint, Aloe Vera")
+            fetch_btn = gr.Button("Process Plants", variant="primary")
+        json_output = gr.JSON(label="AI-Processed Results")
+        fetch_btn.click(
+            fn=process_plants(plants),
+            #fn=lambda x: process_plants([p.strip() for p in x.split(",")]),
+            #inputs=plant_input,
+            outputs=json_output
+        )
+if __name__ == "__main__":
+    app.launch(debug=True, share=False)