| |
| """ |
| OCR Extrator - Extrai texto de screenshots do Taobao usando PaddleOCR |
| """ |
|
|
| import json |
| import re |
| import sys |
| from pathlib import Path |
| from paddleocr import PaddleOCR |
|
|
| |
# Single PaddleOCR engine, built once at import time and reused by
# extract_text_from_image for every screenshot.
# NOTE(review): lang='ch' presumably selects PaddleOCR's Chinese model --
# confirm against the installed PaddleOCR version.
ocr = PaddleOCR(lang='ch')
|
|
def extract_text_from_image(image_path, min_confidence=0.5):
    """Run OCR on one image and return the recognised text fragments.

    Args:
        image_path: Location of the image (``str`` or path-like). It is
            converted with ``str()`` before being passed to the OCR engine.
        min_confidence: Fragments whose confidence is not strictly greater
            than this value are dropped. Defaults to 0.5, the threshold that
            was previously hard-coded.

    Returns:
        A list of dicts with the keys ``'text'``, ``'confidence'`` and
        ``'box'``, in the order the engine reported them. The list is empty
        when the engine reports nothing for the image.
    """
    result = ocr.ocr(str(image_path))

    # Only the first element of the result is consumed, and it may be
    # missing or empty when no text was detected.
    # NOTE(review): presumably one element per input image -- confirm against
    # the PaddleOCR version in use.
    if not result or not result[0]:
        return []

    texts = []
    for line in result[0]:
        if not line:
            continue
        # Each detection is unpacked as (box, (text, confidence)).
        box, (text, confidence) = line
        if confidence > min_confidence:
            texts.append({
                'text': text,
                'confidence': confidence,
                'box': box,
            })
    return texts
|
|
# A yen sign in half-width (U+00A5) or full-width (U+FFE5) form, optional
# whitespace, then an amount with "." or "," as the decimal separator.
# Escapes are used instead of literal glyphs so the two code points cannot be
# silently collapsed into one by an editor or a normalising copy/paste.
_PRICE_RE = re.compile(r'[\u00a5\uffe5]\s*(\d+[.,]\d+)')


def extract_products(texts):
    """Build product records from OCR text fragments.

    A fragment is treated as a product when it contains a yen-prefixed
    decimal price strictly between 10 and 10000. The title is assembled from
    up to three immediately preceding fragments that are longer than three
    characters and do not themselves contain a price; when none qualifies,
    the price fragment's own text is used as the title.

    Args:
        texts: Sequence of dicts carrying at least ``'text'`` and
            ``'confidence'``, in reading order (the shape produced by
            ``extract_text_from_image``).

    Returns:
        A list of dicts with the keys ``'title'`` (truncated to 100
        characters), ``'price'`` (float), ``'raw_text'`` and
        ``'confidence'``, one per detected price, in input order.
    """
    products = []

    for i, item in enumerate(texts):
        text = item['text']

        price_match = _PRICE_RE.search(text)
        if not price_match:
            continue

        # OCR output may use "," as the decimal separator; normalise it.
        try:
            price = float(price_match.group(1).replace(',', '.'))
        except ValueError:
            continue

        # Discard amounts outside the accepted price range.
        if not 10 < price < 10000:
            continue

        # Look back at most three fragments for title text, skipping short
        # fragments and anything that is itself a price.
        title_lines = [
            prev['text']
            for prev in texts[max(0, i - 3):i]
            if len(prev['text']) > 3 and not _PRICE_RE.search(prev['text'])
        ]
        title = ' '.join(title_lines) if title_lines else text

        products.append({
            'title': title[:100],
            'price': price,
            'raw_text': text,
            'confidence': item['confidence'],
        })

    return products
|
|
def main():
    """Run OCR over every PNG screenshot and save the detected products.

    Reads ``*.png`` files from ``/tmp/taobao_results`` in sorted order,
    prints progress for each one, and writes the combined product list as
    JSON to ``/tmp/taobao_products_ocr.json``. Returns early, without
    writing anything, when no screenshot is found.
    """
    source_dir = Path("/tmp/taobao_results")
    destination = Path("/tmp/taobao_products_ocr.json")

    banner_rule = "=" * 60
    print(banner_rule)
    print(" TAOBAO OCR EXTRACTOR")
    print(banner_rule)

    image_paths = sorted(source_dir.glob("*.png"))
    if not image_paths:
        print("Nenhum screenshot encontrado")
        return

    collected = []
    for image_path in image_paths:
        print(f"\n🔍 Processando: {image_path.name}")

        fragments = extract_text_from_image(image_path)
        print(f" Textos extraídos: {len(fragments)}")

        # Preview at most the first five fragments as a quick sanity check.
        for fragment in fragments[:5]:
            preview = fragment['text'][:50]
            score = fragment['confidence']
            print(f" - {preview} (conf: {score:.2f})")

        found = extract_products(fragments)
        print(f" Produtos: {len(found)}")
        collected += found

    # One JSON document containing the count and the full product list.
    payload = {'total_products': len(collected), 'products': collected}
    with destination.open('w', encoding='utf-8') as handle:
        json.dump(payload, handle, indent=2, ensure_ascii=False)

    print(f"\n✅ Salvo: {destination}")
    print(f"📦 Total de produtos: {len(collected)}")
|
|
# Run the extractor only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
|
|