Spaces:
Sleeping
Sleeping
| import os | |
| import base64 | |
| from google import genai | |
| from google.genai import types | |
| from typing import Optional, Dict, Any | |
def _extract_markdown_title(markdown: str, default: str = "Unknown Title") -> str:
    """Return the text of the first ``# `` heading in *markdown*, else *default*."""
    for line in markdown.splitlines():
        if line.startswith("# "):
            # Drop only the leading marker. str.replace("# ", "") would also
            # delete every later "# " in the heading ("# C# in Depth").
            return line[2:].strip()
    return default


def scrape_with_vision(screenshot_bytes: bytes, url: str) -> Optional[Dict[str, Any]]:
    """
    Transcribe a screenshot of an article into Markdown with Gemini Vision.

    Uses the google.genai SDK with the ``gemini-2.0-flash-exp`` model.

    Args:
        screenshot_bytes: Image data of the page; sent to the model labelled
            as ``image/png``.
        url: Address of the page; copied into the result unchanged.

    Returns:
        A dict with the keys ``url``, ``title``, ``author``,
        ``markdownContent`` and ``source``, or ``None`` when the
        GEMINI_API_KEY environment variable is unset or empty, the screenshot
        is empty, the model returns no text, or the request raises.
    """
    api_key = os.environ.get("GEMINI_API_KEY")
    if not api_key:
        return None
    if not screenshot_bytes:
        # Nothing to transcribe; skip the API round-trip.
        print("Vision Scraping Failed: empty screenshot")
        return None

    prompt = """
        You are a visual web scraper.
        Your task is to transcribe the article in this screenshot into Markdown format.
        Rules:
        1. Extract the Title, Author, and Full Content.
        2. Preserve all headers, code blocks, and formatting.
        3. Ignore ads, sidebars, and navigation menus.
        4. If the content is truncated or behind a paywall in the image, transcribe what is visible and add a note.
        5. Output ONLY the markdown.
        """

    # Best-effort boundary: any SDK or transport failure is reported and
    # turned into None, which is what callers already expect.
    try:
        client = genai.Client(api_key=api_key)
        # Create image part for new SDK
        image_part = types.Part.from_bytes(
            data=screenshot_bytes,
            mime_type="image/png",
        )
        response = client.models.generate_content(
            model='gemini-2.0-flash-exp',
            contents=[prompt, image_part],
        )
        markdown = response.text
    except Exception as e:
        print(f"Vision Scraping Failed: {e}")
        return None

    if markdown is None:
        # The SDK yields no text when the response has no text parts; report
        # that directly instead of tripping an AttributeError further down.
        print("Vision Scraping Failed: model returned no text")
        return None

    return {
        "url": url,
        "title": _extract_markdown_title(markdown),
        # The author is not parsed out of the transcription.
        "author": {"name": "Unknown (Vision Extracted)"},
        "markdownContent": markdown,
        "source": "vision",
    }
def _image_mime_type(content_type: Optional[str], default: str = "image/png") -> str:
    """Return the image MIME type named by a Content-Type header value.

    Header parameters (``; charset=...``) are dropped. Anything outside the
    image formats the Gemini API documents as supported yields *default*.
    """
    if not content_type:
        return default
    mime = content_type.split(";", 1)[0].strip().lower()
    supported = ("image/png", "image/jpeg", "image/webp", "image/heic", "image/heif")
    return mime if mime in supported else default


def extract_chart_data(image_url: str) -> Optional[str]:
    """
    Download a chart image and convert it to CSV text using Gemini Vision.

    Uses the google.genai SDK with the ``gemini-2.0-flash-exp`` model.

    Args:
        image_url: Address of the chart image to download.

    Returns:
        The model's reply with surrounding whitespace stripped, or ``None``
        when the GEMINI_API_KEY environment variable is unset or empty, the
        URL is empty, the download does not answer 200 with a non-empty body,
        the model returns no text, or any step raises.
    """
    api_key = os.environ.get("GEMINI_API_KEY")
    if not api_key or not image_url:
        return None

    prompt = """
        Analyze this chart image.
        Extract the underlying data and output it as a CSV string.
        Do not include any other text, just the CSV.
        """

    # Best-effort boundary: download, SDK and transport failures are all
    # reported and turned into None.
    try:
        import httpx

        # Download image
        # NOTE: httpx does not follow redirects by default, so a redirected
        # URL answers 3xx and is rejected by the status check below.
        with httpx.Client() as http_client:
            resp = http_client.get(image_url)
            if resp.status_code != 200:
                return None
            image_bytes = resp.content
            # Label the bytes with what the server says they are rather than
            # assuming PNG; charts are commonly served as JPEG or WebP.
            mime_type = _image_mime_type(resp.headers.get("content-type"))

        if not image_bytes:
            print("Chart Extraction Failed: downloaded image is empty")
            return None

        genai_client = genai.Client(api_key=api_key)
        # Create image part for new SDK
        image_part = types.Part.from_bytes(
            data=image_bytes,
            mime_type=mime_type,
        )
        response = genai_client.models.generate_content(
            model='gemini-2.0-flash-exp',
            contents=[prompt, image_part],
        )
        csv_text = response.text
        if csv_text is None:
            # The SDK yields no text when the response has no text parts.
            print("Chart Extraction Failed: model returned no text")
            return None
        return csv_text.strip()
    except Exception as e:
        print(f"Chart Extraction Failed: {e}")
        return None