#!/usr/bin/env python3
"""OCR extractor: pull product titles and prices out of Taobao screenshots.

Every ``*.png`` in the screenshot directory is run through PaddleOCR, the
recognised text lines are scanned for yen-prefixed prices, and the products
found are written to a single JSON file.
"""

import json
import re
import sys
from pathlib import Path

from paddleocr import PaddleOCR

# Initialise the OCR engine once for the whole run (Chinese + English model).
ocr = PaddleOCR(lang='ch')

# Default locations used by main().
DEFAULT_SCREENSHOT_DIR = Path("/tmp/taobao_results")
DEFAULT_OUTPUT_FILE = Path("/tmp/taobao_products_ocr.json")

# A price is a yen sign followed by a number with a decimal part, using either
# '.' or ',' as the separator. Both the half-width (U+00A5) and the full-width
# (U+FFE5) yen sign are accepted; escapes are used because the two glyphs are
# visually indistinguishable in most editors.
_PRICE_RE = re.compile(r'[\u00a5\uffe5]\s*(\d+[.,]\d+)')


def extract_text_from_image(image_path, min_confidence=0.5):
    """Run OCR on one image and return the confidently recognised lines.

    Args:
        image_path: Path (``str`` or ``Path``) of the image to read.
        min_confidence: Lines whose confidence is not strictly greater than
            this value are discarded.

    Returns:
        A list of dicts with the keys ``'text'``, ``'confidence'`` and
        ``'box'``, in the order the OCR engine reported them.
    """
    result = ocr.ocr(str(image_path))

    # NOTE(review): assumes the result layout is one list per image whose
    # entries are ``[box, (text, confidence)]`` -- confirm against the
    # installed PaddleOCR version.
    if not result or not result[0]:
        return []

    texts = []
    for line in result[0]:
        if not line:
            continue
        box, (text, confidence) = line
        if confidence > min_confidence:
            texts.append({
                'text': text,
                'confidence': confidence,
                'box': box,
            })
    return texts


def _find_title(texts, index, lookback, fallback):
    """Join the non-price lines just before ``index`` into a product title."""
    title_lines = []
    for entry in texts[max(0, index - lookback):index]:
        candidate = entry['text']
        # Very short fragments and other price lines are not title material.
        if len(candidate) > 3 and not _PRICE_RE.search(candidate):
            title_lines.append(candidate)
    return ' '.join(title_lines) if title_lines else fallback


def extract_products(texts, min_price=10, max_price=10000, title_lookback=3,
                     max_title_length=100):
    """Build product records from OCR text lines.

    A line containing a yen-prefixed price whose value lies strictly between
    ``min_price`` and ``max_price`` produces one product. Its title is taken
    from the preceding lines; when none of them qualifies, the price line
    itself is used as the title.

    Args:
        texts: List of dicts as returned by ``extract_text_from_image``
            (the ``'text'`` and ``'confidence'`` keys are read).
        min_price: Exclusive lower bound for a plausible price.
        max_price: Exclusive upper bound for a plausible price.
        title_lookback: How many preceding lines to consider for the title.
        max_title_length: Titles are truncated to this many characters.

    Returns:
        A list of dicts with the keys ``'title'``, ``'price'``,
        ``'raw_text'`` and ``'confidence'``.
    """
    products = []

    for i, item in enumerate(texts):
        text = item['text']

        price_match = _PRICE_RE.search(text)
        if not price_match:
            continue

        # Normalise a decimal comma to a decimal point before parsing.
        price_str = price_match.group(1).replace(',', '.')
        try:
            price = float(price_str)
        except ValueError:
            # Not a parseable number after all: skip this line only.
            continue

        if not min_price < price < max_price:
            continue

        title = _find_title(texts, i, title_lookback, fallback=text)
        products.append({
            'title': title[:max_title_length],
            'price': price,
            'raw_text': text,
            'confidence': item['confidence'],
        })

    return products


def main(screenshot_dir=DEFAULT_SCREENSHOT_DIR, output_file=DEFAULT_OUTPUT_FILE):
    """Process every PNG screenshot and save the extracted products as JSON.

    Args:
        screenshot_dir: Directory that is searched for ``*.png`` files.
        output_file: Path of the JSON file to write.

    Prints progress to stdout. Nothing is written when no screenshot is
    found.
    """
    screenshot_dir = Path(screenshot_dir)
    output_file = Path(output_file)

    print("=" * 60)
    print(" TAOBAO OCR EXTRACTOR")
    print("=" * 60)

    screenshots = sorted(screenshot_dir.glob("*.png"))

    if not screenshots:
        print("Nenhum screenshot encontrado")
        return

    all_products = []

    for img_path in screenshots:
        print(f"\n🔍 Processando: {img_path.name}")

        texts = extract_text_from_image(img_path)
        print(f" Textos extraídos: {len(texts)}")

        # Show a short preview of what was recognised.
        for t in texts[:5]:
            print(f" - {t['text'][:50]} (conf: {t['confidence']:.2f})")

        products = extract_products(texts)
        print(f" Produtos: {len(products)}")
        all_products.extend(products)

    # Save the combined result.
    result = {
        'total_products': len(all_products),
        'products': all_products,
    }

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=2, ensure_ascii=False)

    print(f"\n✅ Salvo: {output_file}")
    print(f"📦 Total de produtos: {len(all_products)}")


if __name__ == "__main__":
    main()