# NOTE(review): the three lines below were Hugging Face Spaces page-header
# scrape residue ("Spaces: Sleeping"); converted to a comment so the file
# parses as Python.
import sys
import os

# Make the project root (one level up from this script) importable.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from datasets import load_dataset
import json
import ast  # fallback parser for Python-literal strings (single quotes)

# --- 1. Load the dataset ---
print("📥 Loading 'mychen76/invoices-and-receipts_ocr_v1' from Hugging Face...")
try:
    dataset = load_dataset("mychen76/invoices-and-receipts_ocr_v1", split='train')
    print("✅ Dataset loaded successfully!")
except Exception as e:
    # Broad catch is deliberate: any failure (network, auth, schema) should
    # abort the script with a readable message.
    print(f"❌ Failed to load dataset. Error: {e}")
    # exit() would terminate with status 0 (success); report failure instead.
    sys.exit(1)
# --- 2. Print Dataset Information ---
banner = "=" * 60
print(f"\n{banner}")
print("📊 DATASET INFORMATION & FEATURES")
print(banner)
print(f"Number of examples: {len(dataset)}")
print(f"\nFeatures (Columns): {dataset.features}")
# --- 3. Explore a Single Example ---
print("\n" + "="*60)
print("📄 EXPLORING THE FIRST SAMPLE")
print("="*60)
if len(dataset) > 0:
    sample = dataset[0]
    # Each row stores its payloads as JSON strings; decode the wrappers first.
    try:
        raw_data = json.loads(sample['raw_data'])
        parsed_data = json.loads(sample['parsed_data'])
    except json.JSONDecodeError as e:
        print(f"❌ Error decoding main JSON wrappers: {e}")
        # exit() would terminate with status 0 (success); report failure instead.
        sys.exit(1)
    print(f"\nImage object: {sample['image']}")
def safe_parse(content):
    """Parse *content* into a Python object, tolerating both JSON and
    Python-literal syntax.

    A list is returned unchanged. A string is first tried as JSON; if that
    fails (e.g. single-quoted lists), ``ast.literal_eval`` is tried as a
    fallback. Returns ``None`` when parsing fails or when *content* is
    neither a list nor a string.
    """
    if isinstance(content, list):
        return content  # Already a list
    if isinstance(content, str):
        try:
            return json.loads(content)
        except json.JSONDecodeError:
            try:
                return ast.literal_eval(content)
            # literal_eval raises ValueError/SyntaxError on malformed or
            # non-literal input; a bare except here would also swallow
            # KeyboardInterrupt/SystemExit.
            except (ValueError, SyntaxError):
                return None
    return None
# Inspect the OCR payload of the first sample (guard repeated here so this
# section stands alone; equivalent to the original if/else).
if len(dataset) > 0:
    ocr_words = safe_parse(raw_data.get('ocr_words'))
    ocr_boxes = safe_parse(raw_data.get('ocr_boxes'))
    if ocr_words and ocr_boxes:
        print(f"\nFound {len(ocr_words)} OCR words.")
        print("Sample Word & Box Format:")
        # Print first 3 to check coordinate format (4 numbers or 8 numbers?)
        for i in range(min(3, len(ocr_words))):
            print(f"  Word: '{ocr_words[i]}' | Box: {ocr_boxes[i]}")
    else:
        print("❌ OCR fields missing or could not be parsed.")
else:
    print("Dataset is empty.")
# --- 4. Discover All Unique NER Tags ---
print("\n" + "="*60)
print("📋 ALL UNIQUE ENTITY LABELS IN THIS DATASET")
print("="*60)
if len(dataset) > 0:
    all_entity_labels = set()
    print("Scanning dataset for labels...")
    for example in dataset:  # index was unused; plain iteration suffices
        try:
            parsed_example = json.loads(example['parsed_data'])
            # The 'json' field inside might be a string or a dict
            fields_data = parsed_example.get('json', {})
            if isinstance(fields_data, str):
                try:
                    fields = json.loads(fields_data)
                except json.JSONDecodeError:
                    # Fallback for Python-literal strings; raises
                    # ValueError/SyntaxError on garbage, caught below.
                    fields = ast.literal_eval(fields_data)
            else:
                fields = fields_data
            if fields:
                all_entity_labels.update(fields.keys())
        except Exception:
            # Deliberate best-effort scan: skip corrupted examples silently.
            continue
    if all_entity_labels:
        print(f"\nFound {len(all_entity_labels)} unique entity labels:")
        print(sorted(all_entity_labels))
    else:
        print("Could not find any entity labels.")
else:
    print("Cannot analyze tags of an empty dataset.")
# Save the first sample's image for manual inspection.
sample = dataset[0]
# PIL's save() fails if the target directory is missing; create it first.
os.makedirs("data/samples", exist_ok=True)
sample['image'].save("data/samples/test_invoice_no.jpg")
print("Saved sample image to data/samples/test_invoice_no.jpg")