Spaces:

shukdevdatta123
/

AI-WS

Paused

App Files Files Community

shukdevdatta123 committed on May 28, 2025

Commit

c01e49c

verified ·

1 Parent(s): 6457b1e

Update app.py

Browse files

Files changed (1) hide show

app.py +246 -97

app.py CHANGED Viewed

@@ -6,6 +6,13 @@ import json
 import re
 from urllib.parse import urljoin, urlparse
 import time
 class WebScrapingTool:
     def __init__(self):
@@ -48,69 +55,206 @@ Your role is to act as an intelligent browser and data interpreter — able to r
         except Exception as e:
             return False, f"Failed to initialize API client: {str(e)}"
     def scrape_webpage(self, url):
-        """Scrape webpage content"""
         try:
-            headers = {
-                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
-            }
-            response = requests.get(url, headers=headers, timeout=30)
-            response.raise_for_status()
             soup = BeautifulSoup(response.content, 'html.parser')
-            # Remove script and style elements
-            for script in soup(["script", "style", "nav", "footer", "header"]):
-                script.decompose()
             # Extract text content
-            text_content = soup.get_text()
-            # Clean up text
-            lines = (line.strip() for line in text_content.splitlines())
-            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
-            text_content = ' '.join(chunk for chunk in chunks if chunk)
-            # Extract tables
             tables = []
-            for table in soup.find_all('table'):
                 table_data = []
                 headers = []
-                # Extract headers
-                header_row = table.find('tr')
                 if header_row:
-                    headers = [th.get_text().strip() for th in header_row.find_all(['th', 'td'])]
-                # Extract rows
-                for row in table.find_all('tr')[1:]:  # Skip header row
-                    row_data = [td.get_text().strip() for td in row.find_all(['td', 'th'])]
-                    if row_data:
-                        table_data.append(row_data)
-                if headers and table_data:
                     tables.append({
                         'headers': headers,
-                        'data': table_data
                     })
             return {
                 'success': True,
-                'text': text_content[:15000],  # Limit text length
                 'tables': tables,
-                'title': soup.title.string if soup.title else "No title found"
             }
-        except requests.RequestException as e:
             return {
                 'success': False,
-                'error': f"Failed to fetch webpage: {str(e)}"
             }
         except Exception as e:
             return {
                 'success': False,
-                'error': f"Error processing webpage: {str(e)}"
             }
     def analyze_content(self, scraped_data, user_query, api_key):
@@ -125,23 +269,36 @@ Your role is to act as an intelligent browser and data interpreter — able to r
         # Prepare content for AI analysis
         content_text = f"""
-WEBPAGE CONTENT:
 Title: {scraped_data['title']}
-Main Text Content:
-{scraped_data['text']}
-Tables Found: {len(scraped_data['tables'])}
 """
         if scraped_data['tables']:
-            content_text += "\n\nTABLES:\n"
-            for i, table in enumerate(scraped_data['tables']):
-                content_text += f"\nTable {i+1}:\n"
-                content_text += f"Headers: {', '.join(table['headers'])}\n"
-                content_text += "Data:\n"
-                for row in table['data'][:10]:  # Limit rows
-                    content_text += f"  {' | '.join(row)}\n"
         try:
             completion = self.client.chat.completions.create(
@@ -152,7 +309,7 @@ Tables Found: {len(scraped_data['tables'])}
                 model="deepseek/deepseek-chat-v3-0324:free",
                 messages=[
                     {"role": "system", "content": self.system_prompt},
-                    {"role": "user", "content": f"Here is the webpage content:\n\n{content_text}\n\nUser Query: {user_query}"}
                 ],
                 temperature=0.1,
                 max_tokens=4000
@@ -161,7 +318,7 @@ Tables Found: {len(scraped_data['tables'])}
             return completion.choices[0].message.content
         except Exception as e:
-            return f"Error analyzing content: {str(e)}"
 def create_interface():
     tool = WebScrapingTool()
@@ -176,22 +333,29 @@ def create_interface():
         if not user_query.strip():
             return "❌ Please enter your analysis query"
         # Add progress updates
-        yield "🔄 Scraping webpage content..."
         # Scrape webpage
         scraped_data = tool.scrape_webpage(url)
         if not scraped_data['success']:
-            yield f"❌ {scraped_data['error']}"
             return
-        yield f"✅ Successfully scraped webpage!\n📄 Title: {scraped_data['title']}\n📊 Found {len(scraped_data['tables'])} tables\n\n🤖 Analyzing content with DeepSeek V3..."
         # Analyze content
         result = tool.analyze_content(scraped_data, user_query, api_key)
-        yield f"✅ Analysis Complete!\n\n{result}"
     # Create Gradio interface
     with gr.Blocks(title="AI Web Scraping Tool", theme=gr.themes.Soft()) as app:
@@ -199,7 +363,7 @@ def create_interface():
         # 🤖 AI Web Scraping Tool
         ### Powered by DeepSeek V3 & OpenRouter
-        Extract and analyze web content using advanced AI. Simply provide your OpenRouter API key, a URL, and describe what you want to extract.
         """)
         with gr.Row():
@@ -213,51 +377,56 @@ def create_interface():
                 url_input = gr.Textbox(
                     label="🌐 Website URL",
-                    placeholder="https://example.com",
                     info="Enter the URL you want to scrape and analyze"
                 )
                 query_input = gr.Textbox(
                     label="📝 Analysis Query",
                     placeholder="What do you want to extract? (e.g., 'Extract main points and create a summary table')",
-                    lines=3,
                     info="Describe what information you want to extract from the webpage"
                 )
                 with gr.Row():
                     analyze_btn = gr.Button("🚀 Analyze Website", variant="primary", size="lg")
-                    clear_btn = gr.Button("🗑️ Clear", variant="secondary")
             with gr.Column(scale=3):
                 output = gr.Textbox(
                     label="📊 Analysis Results",
-                    lines=20,
-                    max_lines=30,
                     show_copy_button=True,
-                    interactive=False
                 )
-        # Example queries
-        gr.Markdown("""
-        ### 💡 Example Queries:
-        - *"Extract the main summary and any data tables"*
-        - *"Create a table of key statistics mentioned in the article"*
-        - *"Summarize the main points in bullet format"*
-        - *"Extract all numerical data and organize it in a table"*
-        - *"Find and extract contact information and company details"*
-        """)
-        # Example websites
-        with gr.Accordion("📋 Try These Example URLs", open=False):
-            examples = [
-                ["https://www.imf.org/en/Publications/WEO", "Extract economic outlook summary and GDP projections"],
-                ["https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)", "Create a table of top 10 countries by GDP"],
-                ["https://www.who.int/news", "Summarize the latest health news"],
-                ["https://www.nasdaq.com/market-activity/stocks", "Extract stock market data and trends"]
-            ]
-            for url, query in examples:
-                gr.Markdown(f"**URL:** `{url}`  \n**Query:** *{query}*")
         # Event handlers
         analyze_btn.click(
@@ -271,26 +440,6 @@ def create_interface():
             fn=lambda: ("", "", "", ""),
             outputs=[api_key_input, url_input, query_input, output]
         )
-        # Auto-fill example
-        def fill_example():
-            return (
-                "",  # API key remains empty
-                "https://www.imf.org/en/Publications/WEO/Issues/2024/04/16/world-economic-outlook-april-2024",
-                """1. Extract a summary of the main economic outlook from this page.
-2. Extract any available tables or figures with global GDP growth projections.
-3. Create a new table showing:
-   - Country/Region
-   - Projected GDP Growth (2024)
-   - Change from Previous Forecast (if available)
-4. Highlight the top 3 fastest-growing economies in a separate mini-table."""
-            )
-        example_btn = gr.Button("📋 Load IMF Example", variant="secondary")
-        example_btn.click(
-            fn=fill_example,
-            outputs=[url_input, query_input]
-        )
     return app
@@ -298,7 +447,7 @@ if __name__ == "__main__":
     # Create and launch the app
     app = create_interface()
-    # Launch with public sharing enabled
     app.launch(
-        share=True,
     )

 import re
 from urllib.parse import urljoin, urlparse
 import time
+import urllib3
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry
+import ssl
+# Disable SSL warnings
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 class WebScrapingTool:
     def __init__(self):
         except Exception as e:
             return False, f"Failed to initialize API client: {str(e)}"
+    def create_session(self):
+        """Create a robust session with retry strategy and proper headers"""
+        session = requests.Session()
+        # Define retry strategy
+        retry_strategy = Retry(
+            total=3,
+            status_forcelist=[429, 500, 502, 503, 504],
+            method_whitelist=["HEAD", "GET", "OPTIONS"],
+            backoff_factor=1
+        )
+        # Mount adapter with retry strategy
+        adapter = HTTPAdapter(max_retries=retry_strategy)
+        session.mount("http://", adapter)
+        session.mount("https://", adapter)
+        # Set comprehensive headers to mimic real browser
+        session.headers.update({
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+            'Accept-Language': 'en-US,en;q=0.9',
+            'Accept-Encoding': 'gzip, deflate, br',
+            'DNT': '1',
+            'Connection': 'keep-alive',
+            'Upgrade-Insecure-Requests': '1',
+            'Sec-Fetch-Dest': 'document',
+            'Sec-Fetch-Mode': 'navigate',
+            'Sec-Fetch-Site': 'none',
+            'Sec-Fetch-User': '?1',
+            'Cache-Control': 'max-age=0'
+        })
+        return session
     def scrape_webpage(self, url):
         """Fetch ``url`` and extract its text, tables and basic metadata.

         The page is requested with 15s, 30s and 45s timeouts in turn; the
         next timeout is tried after a timeout or an SSL failure.  TLS
         certificate verification is disabled for these requests.

         Args:
             url: URL of the page to scrape.

         Returns:
             dict: On success, a dict with ``'success': True`` plus ``'text'``
             (capped at 20000 characters), ``'tables'`` (each with ``'id'``,
             ``'headers'`` and at most 50 ``'data'`` rows), ``'title'``,
             ``'meta_description'``, ``'url'`` and ``'content_length'``.
             On failure, ``{'success': False, 'error': <message>}``.
             Exceptions are converted to the failure form, not raised.
         """
         timeout_attempts = [15, 30, 45]
         accepted_types = ('text/html', 'text/plain', 'application/xhtml+xml')
         # "ad"/"ads" must be a whole token: a plain substring match would
         # also hit classes such as "header", "thread" or "loading".  The
         # longer keywords keep their substring behaviour.
         ad_class_pattern = re.compile(
             r'(?:^|[^a-z])ads?(?:$|[^a-z])'
             r'|advertisement|banner|popup|modal|cookie|newsletter',
             re.I
         )
         try:
             response = None
             # The context manager closes the session once fetching is done.
             with self.create_session() as session:
                 # Multiple timeout attempts with increasing duration
                 for timeout in timeout_attempts:
                     try:
                         print(f"Attempting to fetch {url} with {timeout}s timeout...")
                         candidate = session.get(
                             url,
                             timeout=timeout,
                             verify=False,  # Disable SSL verification for problematic sites
                             allow_redirects=True,
                             stream=False
                         )
                         candidate.raise_for_status()
                         response = candidate
                         break
                     except requests.exceptions.Timeout:
                         if timeout == timeout_attempts[-1]:  # Last attempt
                             return {
                                 'success': False,
                                 'error': "Connection timed out after multiple attempts. The website may be slow or blocking automated requests."
                             }
                         continue
                     except requests.exceptions.SSLError:
                         # Retry once immediately before moving to the next timeout
                         try:
                             candidate = session.get(
                                 url,
                                 timeout=timeout,
                                 verify=False,
                                 allow_redirects=True
                             )
                             candidate.raise_for_status()
                             # Bind only after the status check so a failed
                             # response is never parsed as a success.
                             response = candidate
                             break
                         except requests.exceptions.RequestException:
                             continue
             # Check if we got a response
             if response is None:
                 return {
                     'success': False,
                     'error': "Failed to establish connection after multiple attempts"
                 }
             # Check content type
             content_type = response.headers.get('content-type', '').lower()
             if not any(kind in content_type for kind in accepted_types):
                 return {
                     'success': False,
                     'error': f"Invalid content type: {content_type}. Expected HTML content."
                 }
             # Parse HTML content
             soup = BeautifulSoup(response.content, 'html.parser')
             # Remove unwanted elements, then elements with ad/tracking classes
             unwanted = soup(["script", "style", "nav", "footer", "header", "aside", "noscript", "iframe"])
             for element in unwanted:
                 # Skip nodes already destroyed along with a removed ancestor
                 if not getattr(element, 'decomposed', False):
                     element.decompose()
             for element in soup.find_all(class_=ad_class_pattern):
                 if not getattr(element, 'decomposed', False):
                     element.decompose()
             # Extract text content and collapse runs of whitespace
             text_content = soup.get_text(separator=' ', strip=True)
             text_content = re.sub(r'\s+', ' ', text_content).strip()
             # Extract tables with improved structure
             tables = self._extract_tables(soup)
             # Extract metadata
             title = soup.title.string.strip() if soup.title and soup.title.string else "No title found"
             # Extract meta description
             meta_desc = ""
             desc_tag = soup.find('meta', attrs={'name': 'description'})
             if desc_tag and desc_tag.get('content'):
                 meta_desc = desc_tag['content'].strip()
             return {
                 'success': True,
                 'text': text_content[:20000],  # Limit text length
                 'tables': tables,
                 'title': title,
                 'meta_description': meta_desc,
                 'url': url,
                 'content_length': len(text_content)
             }
         except requests.exceptions.ConnectionError as e:
             return {
                 'success': False,
                 'error': f"Connection failed: {str(e)}. The website may be down or blocking requests."
             }
         except requests.exceptions.HTTPError as e:
             return {
                 'success': False,
                 'error': f"HTTP Error {e.response.status_code}: {e.response.reason}"
             }
         except requests.exceptions.RequestException as e:
             return {
                 'success': False,
                 'error': f"Request failed: {str(e)}"
             }
         except Exception as e:
             return {
                 'success': False,
                 'error': f"Unexpected error while processing webpage: {str(e)}"
             }

     def _extract_tables(self, soup):
         """Return every non-empty ``<table>`` in ``soup`` as a dict.

         Each dict has ``'id'`` (1-based position among all tables on the
         page), ``'headers'`` (padded or trimmed to the widest data row) and
         ``'data'`` (at most 50 rows of stripped cell text).
         """
         tables = []
         for table_id, table in enumerate(soup.find_all('table'), start=1):
             # Prefer the first row of <thead>; otherwise use the first row
             thead = table.find('thead')
             header_row = thead.find('tr') if thead else table.find('tr')
             headers = []
             if header_row:
                 for th in header_row.find_all(['th', 'td']):
                     header_text = th.get_text(strip=True)
                     headers.append(header_text if header_text else f"Column_{len(headers)+1}")
             table_data = []
             for row in table.find_all('tr'):
                 # Skip the header row by identity, wherever it sits
                 if row is header_row:
                     continue
                 row_data = [cell.get_text(strip=True) for cell in row.find_all(['td', 'th'])]
                 if any(row_data):  # Skip empty rows
                     table_data.append(row_data)
             if not table_data:
                 continue
             # Ensure headers match data columns
             max_cols = max(len(row) for row in table_data)
             if len(headers) < max_cols:
                 headers.extend(f"Column_{n}" for n in range(len(headers) + 1, max_cols + 1))
             elif len(headers) > max_cols:
                 headers = headers[:max_cols]
             tables.append({
                 'id': table_id,
                 'headers': headers,
                 'data': table_data[:50]  # Limit rows to prevent overwhelming
             })
         return tables
     def analyze_content(self, scraped_data, user_query, api_key):
         # Prepare content for AI analysis
         content_text = f"""
+WEBPAGE ANALYSIS REQUEST
+========================
+URL: {scraped_data['url']}
 Title: {scraped_data['title']}
+Content Length: {scraped_data['content_length']} characters
+Tables Found: {len(scraped_data['tables'])}
+META DESCRIPTION:
+{scraped_data['meta_description']}
+MAIN CONTENT:
+{scraped_data['text']}
 """
         if scraped_data['tables']:
+            content_text += f"\n\nSTRUCTURED DATA - {len(scraped_data['tables'])} TABLE(S) FOUND:\n"
+            content_text += "=" * 50 + "\n"
+            for table in scraped_data['tables']:
+                content_text += f"\nTABLE {table['id']}:\n"
+                content_text += f"Headers: {' | '.join(table['headers'])}\n"
+                content_text += "-" * 50 + "\n"
+                for i, row in enumerate(table['data'][:10]):  # Show first 10 rows
+                    content_text += f"Row {i+1}: {' | '.join(str(cell) for cell in row)}\n"
+                if len(table['data']) > 10:
+                    content_text += f"... and {len(table['data']) - 10} more rows\n"
+                content_text += "\n"
         try:
             completion = self.client.chat.completions.create(
                 model="deepseek/deepseek-chat-v3-0324:free",
                 messages=[
                     {"role": "system", "content": self.system_prompt},
+                    {"role": "user", "content": f"{content_text}\n\nUSER REQUEST:\n{user_query}\n\nPlease analyze the above webpage content and fulfill the user's request. Be thorough and accurate."}
                 ],
                 temperature=0.1,
                 max_tokens=4000
             return completion.choices[0].message.content
         except Exception as e:
+            return f"Error analyzing content with AI: {str(e)}"
 def create_interface():
     tool = WebScrapingTool()
         if not user_query.strip():
             return "❌ Please enter your analysis query"
+        # Validate URL format
+        if not url.startswith(('http://', 'https://')):
+            url = 'https://' + url
         # Add progress updates
+        yield "🔄 Initializing web scraper..."
+        time.sleep(0.5)
+        yield "🌐 Fetching webpage content (this may take a moment)..."
         # Scrape webpage
         scraped_data = tool.scrape_webpage(url)
         if not scraped_data['success']:
+            yield f"❌ Scraping Failed: {scraped_data['error']}"
             return
+        yield f"✅ Successfully scraped webpage!\n📄 Title: {scraped_data['title']}\n📊 Found {len(scraped_data['tables'])} tables\n📝 Content: {scraped_data['content_length']} characters\n\n🤖 Analyzing content with DeepSeek V3..."
         # Analyze content
         result = tool.analyze_content(scraped_data, user_query, api_key)
+        yield f"✅ Analysis Complete!\n{'='*50}\n\n{result}"
     # Create Gradio interface
     with gr.Blocks(title="AI Web Scraping Tool", theme=gr.themes.Soft()) as app:
         # 🤖 AI Web Scraping Tool
         ### Powered by DeepSeek V3 & OpenRouter
+        Extract and analyze web content using advanced AI. The tool handles timeouts, SSL issues, and provides robust scraping capabilities.
         """)
         with gr.Row():
                 url_input = gr.Textbox(
                     label="🌐 Website URL",
+                    placeholder="https://example.com or just example.com",
                     info="Enter the URL you want to scrape and analyze"
                 )
                 query_input = gr.Textbox(
                     label="📝 Analysis Query",
                     placeholder="What do you want to extract? (e.g., 'Extract main points and create a summary table')",
+                    lines=4,
                     info="Describe what information you want to extract from the webpage"
                 )
                 with gr.Row():
                     analyze_btn = gr.Button("🚀 Analyze Website", variant="primary", size="lg")
+                    clear_btn = gr.Button("🗑️ Clear All", variant="secondary")
             with gr.Column(scale=3):
                 output = gr.Textbox(
                     label="📊 Analysis Results",
+                    lines=25,
+                    max_lines=40,
                     show_copy_button=True,
+                    interactive=False,
+                    placeholder="Results will appear here after analysis..."
                 )
+        # Tips and Examples
+        with gr.Accordion("💡 Usage Tips & Examples", open=False):
+            gr.Markdown("""
+            ### 🎯 Example Analysis Queries:
+            - **Data Extraction**: *"Extract all numerical data and organize it in a table format"*
+            - **Content Summary**: *"Summarize the main points in bullet format with key statistics"*
+            - **Table Processing**: *"Find all tables and convert them to a single consolidated format"*
+            - **Specific Information**: *"Extract contact information, prices, or product details"*
+            - **Comparison**: *"Compare different items/options mentioned and create a comparison table"*
+            ### 🔧 Technical Notes:
+            - **Multiple Timeouts**: Tool tries 15s, 30s, then 45s timeouts automatically
+            - **SSL Handling**: Bypasses SSL issues for problematic websites
+            - **Content Filtering**: Removes ads, popups, and unnecessary elements
+            - **Table Detection**: Automatically finds and structures tabular data
+            - **Error Recovery**: Handles connection issues and provides clear error messages
+            ### 🌐 Works Well With:
+            - News websites (BBC, CNN, Reuters)
+            - Government sites (IMF, WHO, official statistics)
+            - Wikipedia and educational content
+            - E-commerce product pages
+            - Financial data sites (Yahoo Finance, MarketWatch)
+            - Research papers and academic sites
+            """)
         # Event handlers
         analyze_btn.click(
             fn=lambda: ("", "", "", ""),
             outputs=[api_key_input, url_input, query_input, output]
         )
     return app
     # Create and launch the app
     app = create_interface()
+    # Launch with enhanced configuration
     app.launch(
+        share=True
     )