Spaces:

Agents-MCP-Hackathon
/

MailQuery

Runtime error

App Files Files Community

Arun Raghav committed on Jun 8, 2025

Commit

687083b

1 Parent(s): b0ee7e5

scrape fixes

Browse files

Files changed (6) hide show

.gitignore +1 -0
agentic_implementation/email_scraper.py +205 -3
agentic_implementation/name_mapping.json +2 -1
agentic_implementation/re_act.py +6 -22
agentic_implementation/schemas.py +1 -2
agentic_implementation/tools.py +96 -45

.gitignore CHANGED Viewed

@@ -1,5 +1,6 @@
 .env
 myenv/
 __pycache__/
 *.py[cod]

 .env
 myenv/
+venv/
 __pycache__/
 *.py[cod]

agentic_implementation/email_scraper.py CHANGED Viewed

@@ -19,17 +19,102 @@ load_dotenv()
 # Email credentials
 APP_PASSWORD = os.getenv("APP_PASSWORD")
 EMAIL_ID = os.getenv("EMAIL_ID")
 EMAIL_DB_FILE = "email_db.json"
 def _imap_connect():
     """Connect to Gmail IMAP server"""
     try:
         mail = imaplib.IMAP4_SSL("imap.gmail.com")
-        mail.login(EMAIL_ID, APP_PASSWORD)
-        mail.select('"[Gmail]/All Mail"')
         return mail
     except Exception as e:
-        print(f"IMAP connection failed: {e}")
         raise
 def _email_to_clean_text(msg):
@@ -249,6 +334,123 @@ def scrape_emails_from_sender(sender_email: str, start_date: str, end_date: str)
         print(f"Email scraping failed: {e}")
         raise
 # Test the scraper
 if __name__ == "__main__":
     # Test scraping

 # Email credentials
 APP_PASSWORD = os.getenv("APP_PASSWORD")
 EMAIL_ID = os.getenv("EMAIL_ID")
+print("EMAIL_ID: ", EMAIL_ID)
 EMAIL_DB_FILE = "email_db.json"
+def validate_email_setup():
+    """Validate email setup and credentials"""
+    print("=== Email Setup Validation ===")
+    # Check .env file existence
+    env_file_exists = os.path.exists('.env')
+    print(f".env file exists: {'✅ Yes' if env_file_exists else '❌ No'}")
+    if not env_file_exists:
+        print("❌ No .env file found! Create one with:")
+        print("   EMAIL_ID=your_email@gmail.com")
+        print("   APP_PASSWORD=your_16_char_app_password")
+        print("   OPENAI_API_KEY=your_openai_key")
+        return False
+    # Check environment variables
+    issues = []
+    if not EMAIL_ID:
+        issues.append("EMAIL_ID not set or empty")
+    elif '@' not in EMAIL_ID:
+        issues.append("EMAIL_ID doesn't look like an email address")
+    elif not EMAIL_ID.endswith('@gmail.com'):
+        issues.append("EMAIL_ID should be a Gmail address (@gmail.com)")
+    if not APP_PASSWORD:
+        issues.append("APP_PASSWORD not set or empty")
+    elif len(APP_PASSWORD) != 16:
+        issues.append(f"APP_PASSWORD should be 16 characters, got {len(APP_PASSWORD)}")
+    elif ' ' in APP_PASSWORD:
+        issues.append("APP_PASSWORD should not contain spaces (remove spaces from app password)")
+    if not os.getenv("OPENAI_API_KEY"):
+        issues.append("OPENAI_API_KEY not set (needed for query processing)")
+    if issues:
+        print("❌ Issues found:")
+        for issue in issues:
+            print(f"   - {issue}")
+        return False
+    else:
+        print("✅ All credentials look good!")
+        return True
 def _imap_connect():
     """Connect to Gmail IMAP server"""
+    print("=== IMAP Connection Debug ===")
+    # Check if environment variables are loaded
+    print(f"EMAIL_ID loaded: {'✅ Yes' if EMAIL_ID else '❌ No (None/Empty)'}")
+    print(f"APP_PASSWORD loaded: {'✅ Yes' if APP_PASSWORD else '❌ No (None/Empty)'}")
+    if EMAIL_ID:
+        print(f"Email ID: {EMAIL_ID[:5]}...@{EMAIL_ID.split('@')[1] if '@' in EMAIL_ID else 'INVALID'}")
+    if APP_PASSWORD:
+        print(f"App Password length: {len(APP_PASSWORD)} characters")
+        print(f"App Password format: {'✅ Looks correct (16 chars)' if len(APP_PASSWORD) == 16 else f'❌ Expected 16 chars, got {len(APP_PASSWORD)}'}")
+    if not EMAIL_ID or not APP_PASSWORD:
+        error_msg = "Missing credentials in environment variables!"
+        print(f"❌ {error_msg}")
+        raise Exception(error_msg)
     try:
+        print("🔄 Attempting IMAP SSL connection to imap.gmail.com:993...")
         mail = imaplib.IMAP4_SSL("imap.gmail.com")
+        print("✅ SSL connection established")
+        print("🔄 Attempting login...")
+        result = mail.login(EMAIL_ID, APP_PASSWORD)
+        print(f"✅ Login successful: {result}")
+        print("🔄 Selecting mailbox: [Gmail]/All Mail...")
+        result = mail.select('"[Gmail]/All Mail"')
+        print(f"✅ Mailbox selected: {result}")
+        print("=== IMAP Connection Successful ===")
         return mail
+    except imaplib.IMAP4.error as e:
+        print(f"❌ IMAP Error: {e}")
+        print("💡 Possible causes:")
+        print("   - App Password is incorrect or expired")
+        print("   - 2FA not enabled on Gmail account")
+        print("   - IMAP access not enabled in Gmail settings")
+        print("   - Gmail account locked or requires security verification")
+        raise
     except Exception as e:
+        print(f"❌ Connection Error: {e}")
+        print("💡 Possible causes:")
+        print("   - Network connectivity issues")
+        print("   - Gmail IMAP server temporarily unavailable")
+        print("   - Firewall blocking IMAP port 993")
         raise
 def _email_to_clean_text(msg):
         print(f"Email scraping failed: {e}")
         raise
+def scrape_emails_by_text_search(keyword: str, start_date: str, end_date: str) -> List[Dict]:
+    """
+    Scrape emails containing a specific keyword (like company name) within date range.
+    Uses IMAP text search to find emails from senders containing the keyword.
+    """
+    print(f"Searching emails containing '{keyword}' between {start_date} and {end_date}")
+    # Validate setup first
+    if not validate_email_setup():
+        raise Exception("Email setup validation failed. Please check your .env file and credentials.")
+    try:
+        mail = _imap_connect()
+        # Prepare IMAP search criteria with text search
+        start_imap = _date_to_imap_format(start_date)
+        # Add one day to end_date for BEFORE criteria (IMAP BEFORE is exclusive)
+        end_dt = datetime.strptime(end_date, "%d-%b-%Y") + timedelta(days=1)
+        end_imap = end_dt.strftime("%d-%b-%Y")
+        # Search for emails containing the keyword in FROM field or SUBJECT or BODY
+        # We'll search multiple criteria and combine results
+        search_criteria_list = [
+            f'FROM "{keyword}" SINCE "{start_imap}" BEFORE "{end_imap}"',
+            f'SUBJECT "{keyword}" SINCE "{start_imap}" BEFORE "{end_imap}"',
+            f'BODY "{keyword}" SINCE "{start_imap}" BEFORE "{end_imap}"'
+        ]
+        all_email_ids = set()
+        # Search with multiple criteria to catch emails containing the keyword
+        for search_criteria in search_criteria_list:
+            try:
+                print(f"IMAP search: {search_criteria}")
+                status, data = mail.search(None, search_criteria)
+                if status == 'OK' and data[0]:
+                    email_ids = data[0].split()
+                    all_email_ids.update(email_ids)
+                    print(f"Found {len(email_ids)} emails with this criteria")
+            except Exception as e:
+                print(f"Search criteria failed: {search_criteria}, error: {e}")
+                continue
+        print(f"Total unique emails found: {len(all_email_ids)}")
+        scraped_emails = []
+        # Process each email
+        for i, email_id in enumerate(all_email_ids):
+            try:
+                print(f"Processing email {i+1}/{len(all_email_ids)}")
+                # Fetch email
+                status, msg_data = mail.fetch(email_id, "(RFC822)")
+                if status != 'OK':
+                    continue
+                # Parse email
+                msg = message_from_bytes(msg_data[0][1])
+                # Extract information
+                subject = msg.get("Subject", "No Subject")
+                from_header = msg.get("From", "Unknown Sender")
+                content = _email_to_clean_text(msg)
+                # Check if the keyword is actually present (case-insensitive)
+                keyword_lower = keyword.lower()
+                if not any(keyword_lower in text.lower() for text in [subject, from_header, content]):
+                    continue
+                # Parse date
+                date_header = msg.get("Date", "")
+                if date_header:
+                    try:
+                        dt_obj = parsedate_to_datetime(date_header)
+                        # Convert to IST
+                        ist_dt = dt_obj.astimezone(ZoneInfo("Asia/Kolkata"))
+                        email_date = ist_dt.strftime("%d-%b-%Y")
+                        email_time = ist_dt.strftime("%H:%M:%S")
+                    except:
+                        email_date = datetime.today().strftime("%d-%b-%Y")
+                        email_time = "00:00:00"
+                else:
+                    email_date = datetime.today().strftime("%d-%b-%Y")
+                    email_time = "00:00:00"
+                # Double-check date range
+                if not _is_date_in_range(email_date, start_date, end_date):
+                    continue
+                # Get message ID for deduplication
+                message_id = msg.get("Message-ID", f"missing-{email_id.decode()}")
+                scraped_emails.append({
+                    "date": email_date,
+                    "time": email_time,
+                    "subject": subject,
+                    "from": from_header,
+                    "content": content[:2000],  # Limit content length
+                    "message_id": message_id
+                })
+            except Exception as e:
+                print(f"Error processing email {email_id}: {e}")
+                continue
+        mail.logout()
+        # Sort by date (newest first)
+        scraped_emails.sort(key=lambda x: datetime.strptime(f"{x['date']} {x['time']}", "%d-%b-%Y %H:%M:%S"), reverse=True)
+        print(f"Successfully processed {len(scraped_emails)} emails containing '{keyword}'")
+        return scraped_emails
+    except Exception as e:
+        print(f"Email text search failed: {e}")
+        raise
 # Test the scraper
 if __name__ == "__main__":
     # Test scraping

agentic_implementation/name_mapping.json CHANGED Viewed

@@ -1,3 +1,4 @@
 {
-  "dev agarwal": "agarwal.27@iitj.ac.in"
 }

 {
+  "dev agarwal": "agarwal.27@iitj.ac.in",
+  "axis bank": "alerts@axisbank.com"
 }

agentic_implementation/re_act.py CHANGED Viewed

@@ -26,7 +26,7 @@ NAME_MAPPING_FILE = "name_mapping.json"
 SYSTEM_PLAN_PROMPT = """
 You are an email assistant agent. You have access to the following actions:
-  • fetch_emails - fetch emails based on sender and date criteria (includes date extraction)
   • show_email - display specific email content
   • analyze_emails - analyze email patterns or content
   • draft_reply - create a reply to an email
@@ -44,11 +44,11 @@ When the user gives you a query, output _only_ valid JSON of this form:
 }
 Rules:
-- Use "fetch_emails" when you need to retrieve emails (it automatically handles date extraction)
 - The final entry _must_ be "done"
 - If no tool is needed, return `{"plan":["done"]}`
-Example: For "show me emails from dev today" → ["fetch_emails", "done"]
 """
 SYSTEM_VALIDATOR_TEMPLATE = """
@@ -182,31 +182,15 @@ def think(
 ) -> Tuple[bool, Optional[PlanStep], Optional[str]]:
     """
     Fill in parameters or skip based on the action:
-     - fetch_emails: extract sender and pass the raw query for date extraction
      - others: ask the LLM validator for params
     Returns: (should_execute, updated_step, user_prompt_if_needed)
     """
-    # 1) fetch_emails → extract sender and pass query for internal date extraction
     if step.action == "fetch_emails":
-        # Extract sender using LLM
-        sender_info = extract_sender_info(user_query)
-        sender_intent = sender_info.get("sender_intent", "")
-        if not sender_intent:
-            return False, None, None
-        # Resolve sender to email address
-        email_address, needs_input = resolve_sender_email(sender_intent)
-        if needs_input:
-            # Need user input for email address
-            prompt_msg = f"I don't have an email address for '{sender_intent}'. Please provide the email address:"
-            return False, None, prompt_msg
         params = FetchEmailsParams(
-            email=email_address,
-            query=user_query  # Pass the full query for date extraction
         )
         return True, PlanStep(action="fetch_emails", parameters=params), None

 SYSTEM_PLAN_PROMPT = """
 You are an email assistant agent. You have access to the following actions:
+  • fetch_emails - fetch emails using text search with sender keywords and date extraction (e.g., "swiggy emails last week")
   • show_email - display specific email content
   • analyze_emails - analyze email patterns or content
   • draft_reply - create a reply to an email
 }
 Rules:
+- Use "fetch_emails" for text-based email search (automatically extracts sender keywords and dates)
 - The final entry _must_ be "done"
 - If no tool is needed, return `{"plan":["done"]}`
+Example: For "show me emails from swiggy today" → ["fetch_emails", "done"]
 """
 SYSTEM_VALIDATOR_TEMPLATE = """
 ) -> Tuple[bool, Optional[PlanStep], Optional[str]]:
     """
     Fill in parameters or skip based on the action:
+     - fetch_emails: pass the raw query for text-based search and date extraction
      - others: ask the LLM validator for params
     Returns: (should_execute, updated_step, user_prompt_if_needed)
     """
+    # 1) fetch_emails → pass the full query for text-based search and date extraction
     if step.action == "fetch_emails":
         params = FetchEmailsParams(
+            query=user_query  # Pass the full query for keyword and date extraction
         )
         return True, PlanStep(action="fetch_emails", parameters=params), None

agentic_implementation/schemas.py CHANGED Viewed

@@ -6,8 +6,7 @@ from typing import List, Literal, Optional, Union
 class FetchEmailsParams(BaseModel):
-    email: str
-    query: str  # Changed from start_date/end_date to query for internal date extraction
 class ShowEmailParams(BaseModel):

 class FetchEmailsParams(BaseModel):
+    """Parameters for the fetch_emails action: one natural-language query."""
+    query: str  # Natural language query with sender and date info (e.g., "show me mails for last week from swiggy")
 class ShowEmailParams(BaseModel):

agentic_implementation/tools.py CHANGED Viewed

@@ -6,8 +6,8 @@ from schemas import (
     SendReplyParams,
 )
 from typing import Any, Dict
-from email_scraper import scrape_emails_from_sender, _load_email_db, _save_email_db, _is_date_in_range
-from datetime import datetime
 from typing import List
 from openai import OpenAI
 import json
@@ -22,40 +22,48 @@ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
 client = OpenAI(api_key=OPENAI_API_KEY)
-def extract_date_range(query: str) -> Dict[str, str]:
     """
-    Use an LLM to extract a date range from a user query.
-    Returns {"start_date":"DD-MMM-YYYY","end_date":"DD-MMM-YYYY"}.
     """
     today_str = datetime.today().strftime("%d-%b-%Y")
     system_prompt = f"""
-You are a date‐range extractor. Today is {today_str}.
-Given a user query (in natural language), return _only_ valid JSON with:
-  {{
-    "start_date": "DD-MMM-YYYY",
-    "end_date":   "DD-MMM-YYYY"
-  }}
-Interpret relative dates as:
-- "today"       → {today_str} to {today_str}
-- "yesterday"   → 1 day ago to 1 day ago
-- "last week"   → 7 days ago to {today_str}
-- "last month"  → 30 days ago to {today_str}
-- "last N days" → N days ago to {today_str}
 Examples:
-- "emails from dev agarwal last week"
-  → {{ "start_date": "01-Jun-2025", "end_date": "{today_str}" }}
-- "show me emails yesterday"
-  → {{ "start_date": "06-Jun-2025", "end_date": "06-Jun-2025" }}
 Return _only_ the JSON object—no extra text.
 """
     messages = [
-        {"role": "system",  "content": system_prompt},
-        {"role": "user",    "content": query}
     ]
     resp = client.chat.completions.create(
         model="gpt-4o-mini",
@@ -73,31 +81,58 @@ Return _only_ the JSON object—no extra text.
         return json.loads(content[start:end])
-def fetch_emails(email: str, query: str) -> Dict:
     """
-    Fetch emails from a sender within a date range extracted from the query.
-    Now returns both date info and emails.
     Args:
-        email: The sender's email address
-        query: The original user query (for date extraction)
     Returns:
-        Dict with date_info and emails
     """
-    # Extract date range from query
-    date_info = extract_date_range(query)
-    start_date = date_info.get("start_date")
-    end_date = date_info.get("end_date")
-    # Fetch emails using the existing scraper
-    emails = scrape_emails_from_sender(email, start_date, end_date)
-    # Return both date info and emails
     return {
-        "date_info": date_info,
-        "emails": emails,
-        "email_count": len(emails)
     }
@@ -141,18 +176,34 @@ def analyze_emails(emails: List[Dict]) -> Dict:
         "insights": [str, ...] # list of key observations or stats
       }
     """
-    # 1) Prepare the email payload
-    emails_payload = json.dumps(emails, ensure_ascii=False)
     # 2) Build the LLM prompt
     system_prompt = """
 You are an expert email analyst. You will be given a JSON array of email objects,
-each with keys: date, time, subject, content, message_id.
 Your job is to produce _only_ valid JSON with two fields:
 1. summary: a 1–2 sentence high-level overview of these emails.
 2. insights: a list of 3–5 bullet-style observations or statistics
-   (e.g. "2 job offers found", "overall positive tone", "next action: reply").
 Output exactly:

     SendReplyParams,
 )
 from typing import Any, Dict
+from email_scraper import scrape_emails_from_sender, scrape_emails_by_text_search, _load_email_db, _save_email_db, _is_date_in_range
+from datetime import datetime, timedelta
 from typing import List
 from openai import OpenAI
 import json
 client = OpenAI(api_key=OPENAI_API_KEY)
+def extract_query_info(query: str) -> Dict[str, str]:
     """
+    Use an LLM to extract sender information and date range from a user query.
+    Returns {"sender_keyword": "company/sender name", "start_date":"DD-MMM-YYYY","end_date":"DD-MMM-YYYY"}.
     """
     today_str = datetime.today().strftime("%d-%b-%Y")
+    five_days_ago = (datetime.today() - timedelta(days=5)).strftime("%d-%b-%Y")
     system_prompt = f"""
+You are a query parser for email search. Today is {today_str}.
+Given a user query, extract the sender/company keyword and date range. Return _only_ valid JSON with:
+{{
+  "sender_keyword": "keyword or company name to search for",
+  "start_date": "DD-MMM-YYYY",
+  "end_date": "DD-MMM-YYYY"
+}}
+Rules:
+1. Extract sender keywords from phrases like "from swiggy", "swiggy emails", "mails from amazon", etc.
+2. If no time is mentioned, use last 5 days: {five_days_ago} to {today_str}
+3. Interpret relative dates as:
+   - "today" → {today_str} to {today_str}
+   - "yesterday" → 1 day ago to 1 day ago
+   - "last week" → 7 days ago to {today_str}
+   - "last month" → 30 days ago to {today_str}
+   - "last N days" → N days ago to {today_str}
 Examples:
+- "show me mails for last week from swiggy"
+  → {{"sender_keyword": "swiggy", "start_date": "01-Jun-2025", "end_date": "{today_str}"}}
+- "emails from amazon yesterday"
+  → {{"sender_keyword": "amazon", "start_date": "06-Jun-2025", "end_date": "06-Jun-2025"}}
+- "show flipkart emails"
+  → {{"sender_keyword": "flipkart", "start_date": "{five_days_ago}", "end_date": "{today_str}"}}
 Return _only_ the JSON object—no extra text.
 """
     messages = [
+        {"role": "system", "content": system_prompt},
+        {"role": "user", "content": query}
     ]
     resp = client.chat.completions.create(
         model="gpt-4o-mini",
         return json.loads(content[start:end])
+def fetch_emails(query: str) -> Dict:
     """
+    Fetch emails based on a natural language query that contains sender information and date range.
+    Now uses text-based search and returns only summary information, not full content.
     Args:
+        query: The natural language query (e.g., "show me mails for last week from swiggy")
     Returns:
+        Dict with query_info, email_summary, analysis, and email_count
     """
+    # Extract sender keyword and date range from query
+    query_info = extract_query_info(query)
+    sender_keyword = query_info.get("sender_keyword", "")
+    start_date = query_info.get("start_date")
+    end_date = query_info.get("end_date")
+    print(f"Searching for emails with keyword '{sender_keyword}' between {start_date} and {end_date}")
+    # Use the new text-based search function
+    full_emails = scrape_emails_by_text_search(sender_keyword, start_date, end_date)
+    if not full_emails:
+        return {
+            "query_info": query_info,
+            "email_summary": [],
+            "analysis": {"summary": f"No emails found for '{sender_keyword}' in the specified date range.", "insights": []},
+            "email_count": 0
+        }
+    # Create summary version without full content
+    email_summary = []
+    for email in full_emails:
+        summary_email = {
+            "date": email.get("date"),
+            "time": email.get("time"),
+            "subject": email.get("subject"),
+            "from": email.get("from", "Unknown Sender"),
+            "message_id": email.get("message_id")
+            # Note: Removed 'content' to keep response clean
+        }
+        email_summary.append(summary_email)
+    # Auto-analyze the emails for insights
+    analysis = analyze_emails(full_emails)  # Use full emails for analysis but don't return them
+    # Return summary info with analysis
     return {
+        "query_info": query_info,
+        "email_summary": email_summary,
+        "analysis": analysis,
+        "email_count": len(full_emails)
     }
         "insights": [str, ...] # list of key observations or stats
       }
     """
+    if not emails:
+        return {"summary": "No emails to analyze.", "insights": []}
+    # 1) Create a simplified email summary for analysis (without full content)
+    simplified_emails = []
+    for email in emails:
+        simplified_email = {
+            "date": email.get("date"),
+            "time": email.get("time"),
+            "subject": email.get("subject"),
+            "from": email.get("from", "Unknown Sender"),
+            "content_preview": email.get("content", "")[:200] + "..." if email.get("content") else ""
+        }
+        simplified_emails.append(simplified_email)
+    emails_payload = json.dumps(simplified_emails, ensure_ascii=False)
     # 2) Build the LLM prompt
     system_prompt = """
 You are an expert email analyst. You will be given a JSON array of email objects,
+each with keys: date, time, subject, from, content_preview.
 Your job is to produce _only_ valid JSON with two fields:
 1. summary: a 1–2 sentence high-level overview of these emails.
 2. insights: a list of 3–5 bullet-style observations or statistics
+   (e.g. "5 emails from Swiggy", "mostly promotional content", "received over 3 days").
+Focus on metadata like senders, subjects, dates, and patterns rather than detailed content analysis.
 Output exactly: