Spaces:
Sleeping
Sleeping
| import json | |
| from PIL import Image | |
| import pytesseract | |
| import pandas as pd | |
| import os | |
| from pathlib import Path | |
| import logging | |
# Configure logging: INFO level, emitted both to a persistent log file
# ('hawb_processing.log') and to the console, so batch runs leave a
# record while still showing progress live.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('hawb_processing.log'),
        logging.StreamHandler()
    ]
)
# Module-level logger, stdlib convention.
logger = logging.getLogger(__name__)
def load_field_areas(coordinates_json):
    """Load labeled field boxes from a coordinates JSON file.

    The JSON stores each box as a center point plus a size; this converts
    every box to integer corner form keyed by its label.

    Args:
        coordinates_json: Path to the JSON file. Expected structure:
            top-level 'width' and 'height' (reference image size) and
            'boxes', a list of dicts with 'label', 'x', 'y', 'width',
            'height' (center-based, in pixels).

    Returns:
        Tuple (field_areas, image_width, image_height) where field_areas
        maps each label to {'x1', 'y1', 'x2', 'y2'} pixel corners.
    """
    # JSON is UTF-8 by spec; don't depend on the platform default encoding.
    with open(coordinates_json, 'r', encoding='utf-8') as f:
        data = json.load(f)

    field_areas = {}
    for box in data['boxes']:
        x = float(box['x'])
        y = float(box['y'])
        width = float(box['width'])
        height = float(box['height'])
        # Center + size -> top-left / bottom-right corners; int() truncates
        # toward zero, matching the original behavior.
        field_areas[box['label']] = {
            "x1": int(x - width / 2),
            "y1": int(y - height / 2),
            "x2": int(x + width / 2),
            "y2": int(y + height / 2),
        }
    return field_areas, data['width'], data['height']
def extract_text_from_area(image, area, margin=10):
    """Run OCR on one rectangular region of *image*, padded by *margin*.

    The padded rectangle is clamped to the image bounds so the crop never
    falls outside the picture.

    Args:
        image: PIL image to read from.
        area: Dict with 'x1', 'y1', 'x2', 'y2' pixel corners.
        margin: Tolerance in pixels added on every side before cropping.

    Returns:
        The OCR'd text for the region, stripped of surrounding whitespace.
    """
    left = max(area["x1"] - margin, 0)
    top = max(area["y1"] - margin, 0)
    right = min(area["x2"] + margin, image.width)
    bottom = min(area["y2"] + margin, image.height)
    region = image.crop((left, top, right, bottom))
    # --oem 3: default LSTM engine; --psm 6: assume a uniform text block.
    custom_config = r'--oem 3 --psm 6'
    result = pytesseract.image_to_string(region, lang='eng', config=custom_config)
    return result.strip()
def process_hawb(image_path, coordinates_json, margin=5):
    """Extract the main fields from a single HAWB image via OCR.

    Args:
        image_path: Path to the HAWB image file.
        coordinates_json: Path to the JSON file with labeled field boxes.
        margin: Extra pixels of tolerance around each field box.

    Returns:
        Dict with 'filename' plus one entry per expected field; fields
        missing from the coordinates file are set to the empty string.
    """
    logger.info(f"Procesando HAWB: {image_path}")
    field_areas, img_width, img_height = load_field_areas(coordinates_json)
    # Fields expected on the air waybill form.
    campos = ["Shipper_name", "hawb_number", "hawb_date", "number_pieces", "gross_weight", "kg_lb"]
    extracted = {"filename": os.path.basename(image_path)}
    # Context manager closes the underlying file handle deterministically
    # (the original Image.open was never closed).
    with Image.open(image_path) as image:
        # The boxes were drawn against a reference image size; rescale the
        # input so the stored pixel coordinates line up.
        if image.size != (img_width, img_height):
            image = image.resize((img_width, img_height))
        for campo in campos:
            if campo in field_areas:
                extracted[campo] = extract_text_from_area(image, field_areas[campo], margin)
            else:
                extracted[campo] = ""
    return extracted
def process_hawb_batch(image_paths, coordinates_json):
    """Process a batch of HAWB images and collect the results.

    A failure on one image is logged with its traceback and recorded as a
    row containing only the filename; it never aborts the batch.

    Args:
        image_paths: Sequence of paths to HAWB image files.
        coordinates_json: Path to the JSON file with labeled field boxes.

    Returns:
        pandas.DataFrame with one row per input image (failed images yield
        a row whose field columns are NaN).
    """
    logger.info(f"Iniciando procesamiento de {len(image_paths)} HAWB")
    results = []
    for image_path in image_paths:
        try:
            result = process_hawb(image_path, coordinates_json)
            results.append(result)
        except Exception as e:
            # logger.exception records the full traceback, not just str(e).
            logger.exception(f"Error procesando {image_path}: {str(e)}")
            results.append({'filename': os.path.basename(image_path)})
    df = pd.DataFrame(results)
    return df
def main(hawb_dir="./hawb", data_dir="./data", coordinates_json="./coordinates_HAWB.json"):
    """Entry point: OCR every HAWB image found in *hawb_dir*.

    Args:
        hawb_dir: Directory scanned (non-recursively) for .jpg/.jpeg/.png files.
        data_dir: Output directory, created if missing. NOTE(review): nothing
            is written to it in this file — presumably a later step saves the
            DataFrame; confirm before removing.
        coordinates_json: Path to the JSON file with labeled field boxes.

    Returns:
        pandas.DataFrame with one row per processed image.
    """
    Path(data_dir).mkdir(parents=True, exist_ok=True)
    # Sort for a deterministic processing order — os.listdir/iterdir order
    # is filesystem-dependent.
    image_paths = sorted(
        str(p)
        for p in Path(hawb_dir).iterdir()
        if p.suffix.lower() in ('.jpg', '.jpeg', '.png')
    )
    df = process_hawb_batch(image_paths, coordinates_json)
    return df
# Run the batch OCR pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()