meccatronis
/

taobao_scraper

Model card Files Files and versions

xet

Community

meccatronis committed on Feb 11

Commit

72cccfb

verified ·

1 Parent(s): 8d343ee

Upload vision_analyzer.py with huggingface_hub

Browse files

Files changed (1) hide show

vision_analyzer.py +172 -0

vision_analyzer.py ADDED Viewed

	@@ -0,0 +1,172 @@

+#!/usr/bin/env python3
+"""
+Vision Analyzer para Taobao
+Recebe screenshot e extrai informações de produtos usando visão computacional
+"""
+import sys
+import json
+import re
+import subprocess
+from pathlib import Path
+from datetime import datetime
# Configuration: where screenshots are stored and where results are written.
SCREENSHOT_DIR = Path("/tmp/taobao_screenshots")
OUTPUT_FILE = Path("/tmp/taobao_products.json")

# Regular expressions used to pull a price out of a line of text.
# Every pattern exposes the complete numeric value as capture group 1 so
# that consumers reading ``group(1)`` never lose the decimal part.
# Patterns are tried in order; the first match wins.
PRICE_PATTERNS = [
    r'[¥￥]\s*(\d+[.,]\d+)',        # ¥123.45 / ￥123,45
    r'(\d{1,5}[.,]\d{2})\s*元',     # 123.45元
    r'RMB\s*(\d+[.,]\d+)',          # RMB 123.45
    r'[¥￥]\s*(\d+)',               # ¥123 / ￥123 (integer price, either yen sign)
]

# Substrings whose presence suggests the text came from a Taobao/Tmall page.
TAOBAO_INDICATORS = [
    '淘宝', 'Taobao', 'tmall', '天猫', '店铺', '宝贝',
    '月销', '评价', '包邮', '券', '元', '¥', '￥'
]
def take_screenshot(output_path=None):
    """Capture one frame of the X11 screen into an image file via ffmpeg.

    Args:
        output_path: Destination file. When ``None``, a timestamped
            ``screen_YYYYmmdd_HHMMSS.png`` inside ``SCREENSHOT_DIR`` is used.

    Returns:
        The path the screenshot was written to (the value passed in, or the
        generated default).

    Raises:
        RuntimeError: if ffmpeg exits with a non-zero status.
        FileNotFoundError: if the ``ffmpeg`` executable is not installed.
        subprocess.TimeoutExpired: if ffmpeg does not finish within 10 s.
    """
    import os  # local import: only this function needs the environment

    if output_path is None:
        output_path = SCREENSHOT_DIR / f"screen_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png"
    SCREENSHOT_DIR.mkdir(parents=True, exist_ok=True)
    # A caller-supplied path may live outside SCREENSHOT_DIR.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    # Fallback resolution, used whenever xrandr is unavailable or its output
    # does not contain an active mode (the one marked with '*').
    width, height = "1360", "768"
    try:
        result = subprocess.run(['xrandr'], capture_output=True, text=True, timeout=10)
    except (OSError, subprocess.SubprocessError):
        result = None
    if result is not None:
        match = re.search(r'(\d+)x(\d+).*\*', result.stdout)
        if match:
            width, height = match.groups()

    # Grab from the same display xrandr queried; fall back to ":0" when the
    # DISPLAY variable is unset or empty.
    display = os.environ.get("DISPLAY") or ":0"
    grab = subprocess.run([
        "ffmpeg", "-f", "x11grab",
        "-video_size", f"{width}x{height}",
        "-i", display,
        "-frames:v", "1",
        "-y", str(output_path)
    ], capture_output=True, errors="replace", timeout=10)
    if grab.returncode != 0:
        # Surface the failure instead of returning a path to a file that was
        # never written.
        last_lines = grab.stderr.strip().splitlines()[-1:]
        detail = last_lines[0] if last_lines else "no error output"
        raise RuntimeError(
            f"ffmpeg failed with exit code {grab.returncode}: {detail}"
        )
    return output_path
class TaobaoProductExtractor:
    """Extract product information from text taken from a Taobao screenshot."""

    # Matches one number with an optional decimal part ('.' or ',' separator).
    _NUMBER_RE = re.compile(r'\d+(?:[.,]\d+)?')

    def __init__(self):
        self.products = []

    def extract_price_from_text(self, text):
        """Return the first price found in ``text`` as a float, or ``None``.

        Each regex in ``PRICE_PATTERNS`` is tried in order. A comma decimal
        separator is normalized to a dot before conversion.
        """
        for pattern in PRICE_PATTERNS:
            match = re.search(pattern, text)
            if match is None:
                continue
            # With exactly one capture group, that group is the number.
            # Otherwise (no groups, or the number split over several groups)
            # take the whole match so the decimal part is not dropped.
            candidate = match.group(1) if match.lastindex == 1 else match.group(0)
            number = self._NUMBER_RE.search(candidate)
            if number is None:
                continue
            try:
                return float(number.group(0).replace(',', '.'))
            except ValueError:
                continue
        return None

    def is_taobao_page(self, text):
        """Return True if ``text`` contains any Taobao indicator.

        The comparison ignores case so that e.g. ``taobao.com`` and ``Tmall``
        are recognised.
        """
        haystack = text.casefold()
        return any(
            indicator.casefold() in haystack for indicator in TAOBAO_INDICATORS
        )

    def parse_products_from_text(self, text_lines):
        """Build a list of products from OCR-like lines of text.

        Every line containing a price yields one product. Up to three
        preceding lines are joined to form the title, keeping only lines
        that are between 6 and 79 characters long and contain no price
        themselves. When none qualify, a placeholder title is used.

        Returns:
            A list of dicts with the keys ``title``, ``price`` and
            ``raw_line``.
        """
        products = []
        for i, line in enumerate(text_lines):
            line = line.strip()
            price = self.extract_price_from_text(line)
            # Compare against None explicitly: a price of 0.0 is still a price.
            if price is None:
                continue

            # Preceding lines may hold the product title.
            title_lines = []
            for prev_line in text_lines[max(0, i - 3):i]:
                prev_line = prev_line.strip()
                if not 5 < len(prev_line) < 80:
                    continue
                if self.extract_price_from_text(prev_line) is None:
                    title_lines.append(prev_line)

            title = ' '.join(title_lines) if title_lines else f"Produto {len(products)+1}"
            products.append({
                'title': title,
                'price': price,
                'raw_line': line
            })
        return products

    def analyze_with_vision(self, image_path):
        """Placeholder for a vision-API analysis of ``image_path``.

        No image processing happens here: the method prints a short notice
        and returns a dict telling the caller that an external vision API
        is still required.
        """
        print(f"🔍 Analisando: {image_path}")
        print("💡 Esta imagem pode ser interpretada pela API de visão")
        print(f"   Caminho: {image_path}")
        # Hand back usage instructions rather than extracted products.
        return {
            'image_path': str(image_path),
            'needs_vision_api': True,
            'instruction': 'Use GPT-4V ou Claude Vision para extrair produtos desta imagem'
        }
def analyze_screenshot(image_path):
    """Run the extractor's vision analysis on a screenshot and return its result.

    The analysis is currently simulated; a production version would plug in
    real OCR or a vision API.
    """
    return TaobaoProductExtractor().analyze_with_vision(image_path)
def main():
    """Command-line entry point.

    With ``--take`` a new screenshot is captured and analysed; with
    ``--screenshot PATH`` an existing image is analysed. In both cases the
    result is written as JSON to ``--output``. With neither option, a short
    usage message is printed and nothing is written.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Analyze Taobao screenshots')
    parser.add_argument('--screenshot', '-s', help='Path to screenshot file')
    parser.add_argument('--take', '-t', action='store_true', help='Take a new screenshot')
    parser.add_argument('--output', '-o', default=str(OUTPUT_FILE), help='Output JSON file')
    args = parser.parse_args()

    print("=" * 60)
    print("  TAOBAO VISION ANALYZER")
    print("=" * 60)

    # Resolve which image to analyse; --take wins when both options are given.
    if args.take:
        print("\n📸 Tirando screenshot...")
        img_path = take_screenshot()
        print(f"✅ Screenshot salvo: {img_path}")
    elif args.screenshot:
        img_path = args.screenshot
    else:
        print("\nUso:")
        print("  python vision_analyzer.py --take        # Tira screenshot e analisa")
        print("  python vision_analyzer.py -s /path/to/img.png  # Analiza imagem existente")
        return

    result = analyze_screenshot(img_path)
    # Explicit UTF-8: ensure_ascii=False may emit non-ASCII characters, which
    # would fail under a non-UTF-8 default locale encoding.
    with open(args.output, 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=2, ensure_ascii=False)
    print(f"📄 Resultado salvo: {args.output}")


if __name__ == "__main__":
    main()