from flask import Flask, jsonify, request, Response, stream_with_context
from flask_cors import CORS
import requests
from bs4 import BeautifulSoup
import os
import re
import urllib.parse
import time
import random
import base64
from io import BytesIO
from googlesearch import search
import logging
import queue
from huggingface_hub import HfApi

# Create a logging filter to suppress socket warnings
class SocketWarningFilter(logging.Filter):
    def filter(self, record):
        return not (record.levelname == 'WARNING' and 'socket.send()' in record.getMessage())

# Create a queue for log messages
log_queue = queue.Queue()

# Custom log handler that puts messages in the queue
class QueueHandler(logging.Handler):
    def emit(self, record):
        log_entry = self.format(record)
        log_queue.put(log_entry)

# Set up logging with the custom handler
logger = logging.getLogger()
queue_handler = QueueHandler()
queue_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
queue_handler.addFilter(SocketWarningFilter())  # Add the filter to the handler
logger.addHandler(queue_handler)
logger.setLevel(logging.INFO)

# Also add the filter to the root logger to catch all socket warnings
logging.getLogger().addFilter(SocketWarningFilter())

app = Flask(__name__)

# Enable CORS with specific settings
CORS(app, resources={
    r"/*": {
        "origins": "*",
        "methods": ["GET", "POST", "OPTIONS"],
        "allow_headers": ["Content-Type", "Authorization"]
    }
})

def search_images(query, num_images=5):
    # Headers to mimic a browser request
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'DNT': '1',
        'Connection': 'keep-alive',
    }

    # Format the query for URL
    formatted_query = urllib.parse.quote(query)

    # Google Images URL
    url = f"https://www.google.com/search?q={formatted_query}&tbm=isch&safe=active"

    try:
        # Get the HTML content
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()

        # Find all image URLs using regex
        image_urls = re.findall(r'https?://[^"\']*?(?:jpg|jpeg|png|gif)', response.text)

        # Remove duplicates while preserving order
        image_urls = list(dict.fromkeys(image_urls))

        # Store results
        results = []
        downloaded = 0

        for img_url in image_urls:
            if downloaded >= num_images:
                break

            try:
                # Skip small thumbnails and icons
                if 'gstatic.com' in img_url or 'google.com' in img_url:
                    continue

                # Download image
                img_response = requests.get(img_url, headers=headers, timeout=10)
                img_response.raise_for_status()

                # Check if the response is actually an image
                content_type = img_response.headers.get('Content-Type', '')
                if not content_type.startswith('image/'):
                    continue

                # Convert image to base64
                image_base64 = base64.b64encode(img_response.content).decode('utf-8')

                # Add to results
                results.append({
                    'image_url': img_url,
                    'base64_data': f"data:{content_type};base64,{image_base64}"
                })
                downloaded += 1

                # Add a random delay between downloads
                time.sleep(random.uniform(0.5, 1))

            except Exception as e:
                logger.error(f"Error downloading image: {str(e)}")
                continue

        return results

    except Exception as e:
        logger.error(f"An error occurred: {str(e)}")
        return []

HF_TOKEN = os.getenv("HF_TOKEN")  # Make sure you set HF_TOKEN in your environment
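
# NOTE: no @app.route decorator survived in this copy of the file. The path and method
# below are assumptions chosen to match the handler's purpose; adjust them to the app's
# actual URL scheme. POST is assumed because the handler reads a JSON request body.
@app.route('/restart-space', methods=['POST'])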
def api_restart_space():
    """API route to restart a Hugging Face Space."""
    space_id = 'Pamudu13/web-scraper'
    # Optional: set factory_reboot to True in the JSON body for a factory reboot.
    # get_json(silent=True) avoids an error when no JSON body is sent.
    payload = request.get_json(silent=True) or {}
    factory_reboot = payload.get('factory_reboot', False)

    if not space_id:
        return jsonify({'error': 'space_id parameter is required'}), 400

    try:
        hfapi = HfApi()
        # Call the restart_space method
        res = hfapi.restart_space(
            space_id,
            token=HF_TOKEN,
            factory_reboot=factory_reboot
        )
        return jsonify({
            'success': True,
            'message': f"Successfully restarted Space: {space_id}",
            'response': str(res)  # stringified so the returned runtime object is JSON-serializable
        }), 200
    except Exception as e:
        return jsonify({
            'success': False,
            'message': f"Error: {str(e)}"
        }), 500
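
# Assumed route decorator (missing from this copy); a streaming status endpoint is
# normally queried with GET, and space_id is read from the query string below.
@app.route('/space-status', methods=['GET'])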
def get_live_space_status():
    """API route to stream live status of a Hugging Face Space."""
    space_id = request.args.get('space_id', 'Pamudu13/web-scraper')  # Default to 'Pamudu13/web-scraper' if not provided

    def generate():
        while True:
            try:
                # Fetch the current runtime status of the Space
                hf_api = HfApi()
                space_runtime = hf_api.get_space_runtime(repo_id=space_id)

                # Extract relevant details
                status = space_runtime.stage      # e.g., 'BUILDING', 'RUNNING', etc.
                hardware = space_runtime.hardware  # e.g., 'cpu-basic', 't4-medium', etc.

                # Send the status as a Server-Sent Event
                yield f"data: {status}\n\n"
                yield f"data: {hardware}\n\n"

                # Delay before checking the status again
                time.sleep(5)  # Adjust polling interval as needed

            except Exception as e:
                # Handle errors and send an error message
                yield f"data: Error: {str(e)}\n\n"
                break  # Stop the stream in case of an error

    return Response(stream_with_context(generate()), mimetype='text/event-stream')
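
# Assumed route decorator for the image-search endpoint (the original is missing here);
# GET is assumed because the handler reads 'query' and 'num_images' from request.args.
@app.route('/search-images', methods=['GET'])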
def api_search_images():
    try:
        # Get query parameters
        query = request.args.get('query', '')
        num_images = int(request.args.get('num_images', 5))

        if not query:
            return jsonify({'error': 'Query parameter is required'}), 400

        if num_images < 1 or num_images > 20:
            return jsonify({'error': 'Number of images must be between 1 and 20'}), 400

        # Search for images
        results = search_images(query, num_images)

        response = jsonify({
            'success': True,
            'query': query,
            'results': results
        })

        # Add CORS headers
        response.headers['Access-Control-Allow-Origin'] = '*'
        return response

    except Exception as e:
        logger.error(f"Error in search_images: {str(e)}")
        # Build the response object first so CORS headers can be attached,
        # then return it together with the 500 status code.
        response = jsonify({
            'success': False,
            'error': str(e)
        })
        response.headers['Access-Control-Allow-Origin'] = '*'
        return response, 500

def scrape_site_content(query, num_sites=5):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'DNT': '1',
        'Connection': 'keep-alive',
    }

    results = []
    scraped = 0
    retries = 2  # Number of retries per URL
    timeout = 5  # Reduced timeout of 5 seconds per request

    try:
        # Get more URLs than needed to account for failures
        search_results = list(search(query, num_results=num_sites * 2))

        # Process each found URL
        for url in search_results:
            if scraped >= num_sites:
                break

            success = False
            for attempt in range(retries):
                try:
                    # Get the HTML content
                    logger.info(f"Trying {url} (attempt {attempt + 1}/{retries})")
                    logger.info(f"Scraping URL: {url}")
                    response = requests.get(
                        url,
                        headers=headers,
                        timeout=timeout,
                        verify=False  # Skip SSL verification
                    )
                    response.raise_for_status()

                    # Verify it's HTML content
                    content_type = response.headers.get('Content-Type', '').lower()
                    if 'text/html' not in content_type:
                        logger.info(f"Skipping {url} - not HTML content")
                        break

                    # Parse the HTML content
                    soup = BeautifulSoup(response.text, 'html.parser')

                    # Remove script and style elements
                    for script in soup(["script", "style"]):
                        script.decompose()

                    # Extract text content (limit to first 10000 characters)
                    text_content = soup.get_text(separator='\n', strip=True)[:10000]

                    # Skip if not enough content
                    if len(text_content.split()) < 100:  # Skip if less than 100 words
                        logger.info(f"Skipping {url} - not enough content")
                        break

                    # Extract all links (limit to first 10)
                    links = []
                    for link in soup.find_all('a', href=True)[:10]:
                        href = link['href']
                        if href.startswith('http'):
                            links.append({
                                'text': link.get_text(strip=True),
                                'url': href
                            })

                    # Extract meta information
                    title = soup.title.string if soup.title else ''
                    meta_description = ''
                    meta_keywords = ''

                    meta_desc_tag = soup.find('meta', attrs={'name': 'description'})
                    if meta_desc_tag:
                        meta_description = meta_desc_tag.get('content', '')

                    meta_keywords_tag = soup.find('meta', attrs={'name': 'keywords'})
                    if meta_keywords_tag:
                        meta_keywords = meta_keywords_tag.get('content', '')

                    results.append({
                        'url': url,
                        'title': title,
                        'meta_description': meta_description,
                        'meta_keywords': meta_keywords,
                        'text_content': text_content,
                        'links': links
                    })

                    scraped += 1
                    success = True

                    # Add a random delay between scrapes
                    time.sleep(random.uniform(0.5, 1))
                    break  # Break retry loop on success

                except requests.Timeout:
                    print(f"Timeout on {url} (attempt {attempt + 1}/{retries})")
                    if attempt == retries - 1:  # Last attempt
                        print(f"Skipping {url} after {retries} timeout attempts")
                except requests.RequestException as e:
                    print(f"Error scraping {url} (attempt {attempt + 1}/{retries}): {str(e)}")
                    if attempt == retries - 1:  # Last attempt
                        print(f"Skipping {url} after {retries} failed attempts")

                # Add a longer delay between retries
                if not success and attempt < retries - 1:
                    time.sleep(random.uniform(1, 2))

            # If we haven't found enough valid content and have more URLs, continue
            if scraped < num_sites and len(results) < len(search_results):
                continue

        return results

    except Exception as e:
        print(f"Error in search/scraping process: {str(e)}")
        # Return whatever results we've managed to gather
        return results
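
# Assumed route decorator for the site-scraping endpoint (missing from this copy);
# GET is assumed because the handler reads 'query' and 'num_sites' from request.args.
@app.route('/scrape-sites', methods=['GET'])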
def api_scrape_sites():
    try:
        # Get query parameters
        query = request.args.get('query', '')
        num_sites = int(request.args.get('num_sites', 10))

        if not query:
            return jsonify({'error': 'Query parameter is required'}), 400

        if num_sites < 1 or num_sites > 20:
            return jsonify({'error': 'Number of sites must be between 1 and 20'}), 400

        # Scrape the websites
        results = scrape_site_content(query, num_sites)

        response = jsonify({
            'success': True,
            'query': query,
            'results': results
        })

        # Add CORS headers
        response.headers['Access-Control-Allow-Origin'] = '*'
        return response

    except Exception as e:
        logger.error(f"Error in api_scrape_sites: {str(e)}")
        # Build the response object first so CORS headers can be attached,
        # then return it together with the 500 status code.
        response = jsonify({
            'success': False,
            'error': str(e)
        })
        response.headers['Access-Control-Allow-Origin'] = '*'
        return response, 500
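
# Assumed route decorator for the log-streaming SSE endpoint (missing from this copy).
@app.route('/logs', methods=['GET'])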
def stream_logs():
    def generate():
        while True:
            try:
                # Get log message from queue, timeout after 1 second
                log_message = log_queue.get(timeout=1)
                yield f"data: {log_message}\n\n"
            except queue.Empty:
                # Send a heartbeat to keep the connection alive
                yield "data: heartbeat\n\n"
            except GeneratorExit:
                break

    response = Response(stream_with_context(generate()), mimetype='text/event-stream')
    response.headers['Cache-Control'] = 'no-cache'
    response.headers['Connection'] = 'keep-alive'
    return response

if __name__ == '__main__':
    logger.info("Starting Flask API server...")
    app.run(host='0.0.0.0', port=5001, debug=True)
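
# Example usage, assuming the hypothetical route paths added above (adjust to the real app):
#   curl "http://localhost:5001/search-images?query=flowers&num_images=3"
#   curl "http://localhost:5001/scrape-sites?query=python+web+scraping&num_sites=5"
#   curl -X POST -H "Content-Type: application/json" -d '{"factory_reboot": false}' "http://localhost:5001/restart-space"
#   curl -N "http://localhost:5001/logs"   # streams Server-Sent Events until the connection is closed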