meccatronis
/

taobao_scraper

Model card Files Files and versions

taobao_scraper / extract_ocr.py

meccatronis's picture

Upload extract_ocr.py with huggingface_hub

2ce8659 verified 4 months ago

history blame contribute delete

3.25 kB

	#!/usr/bin/env python3
	"""
	OCR Extractor - Extracts text from Taobao screenshots using PaddleOCR
	"""

	import json
	import re
	import sys
	from pathlib import Path
	from paddleocr import PaddleOCR

	# Initialize the OCR engine once at import time (Chinese + English);
	# extract_text_from_image() reuses this single shared instance.
	ocr = PaddleOCR(lang='ch')

	def extract_text_from_image(image_path):
	    """Run OCR on one image and return the text entries worth trusting.

	    Args:
	        image_path: Path to the image; it is converted with ``str()``
	            before being handed to the module-level ``ocr`` engine.

	    Returns:
	        A list of dicts with the keys ``'text'``, ``'confidence'`` and
	        ``'box'``, one per recognized entry whose confidence is above 0.5.
	        The list is empty when the engine reports nothing for the image.
	    """
	    ocr_output = ocr.ocr(str(image_path))

	    # Nothing returned at all, or no entries for the first (only) image.
	    if not ocr_output or not ocr_output[0]:
	        return []

	    trusted = []
	    for entry in ocr_output[0]:
	        if not entry:
	            continue
	        # Each entry unpacks as (box, (text, confidence)); the box is
	        # presumably the detected text region - verify against PaddleOCR.
	        region, (content, score) = entry
	        if score > 0.5:  # only keep results above 50% confidence
	            trusted.append({
	                'text': content,
	                'confidence': score,
	                'box': region,
	            })

	    return trusted

	def extract_products(texts):
	    """Extract product records from OCR text entries by locating price tags.

	    Every entry whose text contains a yen sign followed by an amount is
	    treated as a price line. The amount may be a whole number ("¥128") or
	    have a decimal part written with either '.' or ',' ("￥ 59,90").

	    Args:
	        texts: List of dicts, each with at least the keys ``'text'`` and
	            ``'confidence'`` (the shape produced by
	            ``extract_text_from_image``).

	    Returns:
	        A list of dicts with the keys ``'title'``, ``'price'``,
	        ``'raw_text'`` and ``'confidence'``, in the order the price lines
	        appear in ``texts``. Only prices strictly between 10 and 10000 are
	        kept. The title is built from up to three immediately preceding
	        entries that are longer than 3 characters and contain no price,
	        joined with spaces and cut to 100 characters; when there is no such
	        entry the price line's own text is used.
	    """
	    # Half-width or full-width yen sign, optional whitespace, then the
	    # amount. The decimal part is optional so whole-number prices are
	    # recognized as well. Compiled once instead of per iteration.
	    price_re = re.compile(r'[¥￥]\s*(\d+(?:[.,]\d+)?)')

	    products = []
	    for i, item in enumerate(texts):
	        text = item['text']

	        price_match = price_re.search(text)
	        if not price_match:
	            continue

	        # OCR output may use a comma as the decimal separator.
	        try:
	            price = float(price_match.group(1).replace(',', '.'))
	        except ValueError:
	            # Defensive only: the pattern admits nothing but digits and a
	            # single separator, so this is not expected to trigger.
	            continue

	        # Discard amounts outside the plausible product-price range.
	        if not 10 < price < 10000:
	            continue

	        # The title usually sits in the few entries just above the price.
	        title_lines = [
	            prev['text']
	            for prev in texts[max(0, i - 3):i]
	            if len(prev['text']) > 3 and not price_re.search(prev['text'])
	        ]
	        title = ' '.join(title_lines) if title_lines else text

	        products.append({
	            'title': title[:100],
	            'price': price,
	            'raw_text': text,
	            'confidence': item['confidence'],
	        })

	    return products

	def main():
	    """Run OCR over every PNG screenshot and save the products found.

	    Reads ``*.png`` files from ``/tmp/taobao_results`` in sorted order,
	    prints progress for each one, and writes the combined product list as
	    JSON to ``/tmp/taobao_products_ocr.json``. Returns early, without
	    writing anything, when no screenshot is found.
	    """
	    source_dir = Path("/tmp/taobao_results")
	    target_path = Path("/tmp/taobao_products_ocr.json")

	    banner = "=" * 60
	    print(banner)
	    print(" TAOBAO OCR EXTRACTOR")
	    print(banner)

	    image_paths = sorted(source_dir.glob("*.png"))
	    if not image_paths:
	        print("Nenhum screenshot encontrado")
	        return

	    collected = []
	    for image in image_paths:
	        print(f"\n🔍 Processando: {image.name}")

	        recognized = extract_text_from_image(image)
	        print(f" Textos extraídos: {len(recognized)}")

	        # Preview the first few recognized entries.
	        for entry in recognized[:5]:
	            print(f" - {entry['text'][:50]} (conf: {entry['confidence']:.2f})")

	        found = extract_products(recognized)
	        print(f" Produtos: {len(found)}")

	        collected += found

	    # Persist the aggregated result.
	    payload = {
	        'total_products': len(collected),
	        'products': collected,
	    }
	    with open(target_path, 'w', encoding='utf-8') as handle:
	        json.dump(payload, handle, indent=2, ensure_ascii=False)

	    print(f"\n✅ Salvo: {target_path}")
	    print(f"📦 Total de produtos: {len(collected)}")

	# Run the extractor only when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    main()