taobao_scraper / extract_ocr.py
meccatronis's picture
Upload extract_ocr.py with huggingface_hub
2ce8659 verified
#!/usr/bin/env python3
"""
OCR Extrator - Extrai texto de screenshots do Taobao usando PaddleOCR
"""
import json
import re
import sys
from pathlib import Path
from paddleocr import PaddleOCR
# Initialise the OCR engine (Chinese + English). This is created once at
# module level and reused by extract_text_from_image() for every screenshot.
ocr = PaddleOCR(lang='ch')
def extract_text_from_image(image_path):
    """Run OCR on one image and return the text lines recognised with confidence.

    Args:
        image_path: Path of the image; it is converted with ``str()`` before
            being handed to the module-level ``ocr`` engine.

    Returns:
        A list of dicts with the keys ``'text'``, ``'confidence'`` and
        ``'box'``, one per OCR line whose confidence is above 0.5, in the
        order the engine returned them. The list is empty when the engine
        returns nothing usable.
    """
    pages = ocr.ocr(str(image_path))

    # Only the first element of the OCR result is consumed.
    if not pages or not pages[0]:
        return []

    recognised = []
    for entry in pages[0]:
        if not entry:
            continue
        region, (content, score) = entry
        # Keep only lines the engine is more than 50% sure about.
        if not (score > 0.5):
            continue
        recognised.append({
            'text': content,
            'confidence': score,
            'box': region,
        })
    return recognised
# Currency sign (half-width U+00A5 or full-width U+FFE5), optional whitespace,
# then a number with a "." or "," decimal separator. Group 1 is the number.
# The signs are written as escapes so the pattern survives re-encoding.
_PRICE_RE = re.compile(r'[\u00a5\uffe5]\s*(\d+[.,]\d+)')


def extract_products(texts):
    """Extract products (title + price) from a sequence of OCR text lines.

    Every line that contains a price becomes a product candidate. Its title
    is built from up to three immediately preceding lines that are longer
    than three characters and do not themselves contain a price; when no
    such line exists, the price line's own text is used as the title.

    Args:
        texts: Sequence of dicts as produced by ``extract_text_from_image``;
            each item must provide the keys ``'text'`` and ``'confidence'``.

    Returns:
        A list of dicts with the keys ``'title'`` (truncated to 100
        characters), ``'price'`` (float), ``'raw_text'`` (the line holding
        the price) and ``'confidence'`` (of that line). Only prices strictly
        between 10 and 10000 are kept.

    Note:
        A comma is always treated as a decimal separator, so a value such as
        ``1,299`` is parsed as ``1.299`` and then rejected by the range check.
    """
    products = []
    for i, item in enumerate(texts):
        text = item['text']

        price_match = _PRICE_RE.search(text)
        if not price_match:
            continue

        # Only the numeric conversion is guarded; other errors (e.g. a
        # malformed input item) should surface instead of being swallowed.
        try:
            price = float(price_match.group(1).replace(',', '.'))
        except ValueError:
            continue

        # Plausibility filter: discard values outside the expected range.
        if not (10 < price < 10000):
            continue

        # The title is expected on the lines right before the price.
        title_lines = [
            prev['text']
            for prev in texts[max(0, i - 3):i]
            if len(prev['text']) > 3 and not _PRICE_RE.search(prev['text'])
        ]
        title = ' '.join(title_lines) if title_lines else text

        products.append({
            'title': title[:100],
            'price': price,
            'raw_text': text,
            'confidence': item['confidence'],
        })
    return products
def main():
    """OCR every PNG screenshot and write the extracted products to JSON.

    Reads ``*.png`` files from ``/tmp/taobao_results`` in sorted order, runs
    ``extract_text_from_image`` and ``extract_products`` on each, prints a
    short progress report, and finally writes all products together with
    their count to ``/tmp/taobao_products_ocr.json``. Returns early, without
    writing anything, when no screenshot is found.
    """
    source_dir = Path("/tmp/taobao_results")
    destination = Path("/tmp/taobao_products_ocr.json")

    banner = "=" * 60
    print(banner)
    print(" TAOBAO OCR EXTRACTOR")
    print(banner)

    image_files = sorted(source_dir.glob("*.png"))
    if not image_files:
        print("Nenhum screenshot encontrado")
        return

    collected = []
    for image_file in image_files:
        print(f"\n🔍 Processando: {image_file.name}")

        ocr_lines = extract_text_from_image(image_file)
        print(f" Textos extraídos: {len(ocr_lines)}")

        # Preview the first few recognised lines.
        for sample in ocr_lines[:5]:
            print(f" - {sample['text'][:50]} (conf: {sample['confidence']:.2f})")

        found = extract_products(ocr_lines)
        print(f" Produtos: {len(found)}")
        collected += found

    # Persist the aggregated result.
    payload = {
        'total_products': len(collected),
        'products': collected,
    }
    with destination.open('w', encoding='utf-8') as handle:
        json.dump(payload, handle, indent=2, ensure_ascii=False)

    print(f"\n✅ Salvo: {destination}")
    print(f"📦 Total de produtos: {len(collected)}")
# Run the extractor only when this file is executed as a script.
if __name__ == "__main__":
    main()