#!/usr/bin/env python3
"""
Data Cleaning Script - Cleans all data using a simplified regex method and saves the results
Features:
1. Cleans all cases using a simplified regex method.
2. Saves the cleaned data for each case.
3. Ensures the relative order of dicts remains unchanged.
4. Generates a before-and-after cleaning report.
"""
import json
import re
import os
from typing import Dict, List, Tuple, Optional, Any
from dataclasses import dataclass
import traceback


@dataclass
class CleanedData:
    """Data structure for cleaned data."""
    case_id: int
    original_type: str  # 'list' or 'str'
    original_length: int
    cleaned_data: List[Dict]
    cleaning_operations: Dict[str, Any]  # Records the cleaning operations performed
    success: bool


class OutputCleaner:
    """Data cleaner based on a simplified regex method."""

    def __init__(self):
        # Simplified regular expression patterns
        self.dict_pattern = re.compile(r'\{[^{}]*?"bbox"\s*:\s*\[[^\]]*?\][^{}]*?\}', re.DOTALL)
        self.bbox_pattern = re.compile(r'"bbox"\s*:\s*\[([^\]]+)\]')
        self.missing_delimiter_pattern = re.compile(r'\}\s*\{(?!")')
        self.cleaned_results: List[CleanedData] = []
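        # What these patterns match (illustrative strings, not from the source):
        #   dict_pattern              -> a complete flat object, e.g.
        #                                '{"bbox": [10, 20, 110, 45], "category": "Text", "text": "hi"}'
        #   bbox_pattern              -> captures '10, 20, 110, 45' out of '"bbox": [10, 20, 110, 45]'
        #   missing_delimiter_pattern -> a '}' running into a '{' (whitespace allowed),
        #                                where the '{' is not immediately followed by a quote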

    def clean_list_data(self, data: List[Dict], case_id: int) -> CleanedData:
        """Cleans list-type data."""
        print(f"πŸ”§ Cleaning List data - Case {case_id}")
        print(f" Original items: {len(data)}")
        cleaned_data = []
        operations = {
            'type': 'list',
            'bbox_fixes': 0,
            'removed_items': 0,
            'original_count': len(data)
        }
        for i, item in enumerate(data):
            if not isinstance(item, dict):
                operations['removed_items'] += 1
                continue
            # Check the bbox field
            if 'bbox' in item:
                bbox = item['bbox']
                # Check the bbox length - core logic
                if isinstance(bbox, list) and len(bbox) == 3:
                    print(f" ⚠️ Item {i}: bbox has only 3 coordinates. Removing bbox, keeping category and text.")
                    # Keep only category and text, preserving their order
                    new_item = {}
                    if 'category' in item:
                        new_item['category'] = item['category']
                    if 'text' in item:
                        new_item['text'] = item['text']
                    if new_item:  # Add only if there is valid content
                        cleaned_data.append(new_item)
                        operations['bbox_fixes'] += 1
                    else:
                        operations['removed_items'] += 1
                    continue
                elif isinstance(bbox, list) and len(bbox) == 4:
                    # bbox is normal; add directly, preserving the original order
                    cleaned_data.append(item.copy())
                    continue
                else:
                    print(f" ❌ Item {i}: Abnormal bbox format, skipping.")
                    operations['removed_items'] += 1
                    continue
            else:
                # No bbox field; keep the item if it has a category
                if 'category' in item:
                    cleaned_data.append(item.copy())
                    continue
                else:
                    operations['removed_items'] += 1
        operations['final_count'] = len(cleaned_data)
        print(f" βœ… Cleaning complete: {len(cleaned_data)} items, {operations['bbox_fixes']} bbox fixes, {operations['removed_items']} items removed")
        return CleanedData(
            case_id=case_id,
            original_type='list',
            original_length=len(data),
            cleaned_data=cleaned_data,
            cleaning_operations=operations,
            success=True
        )
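    # Usage sketch for clean_list_data (illustrative input, not from the source):
    #   cleaner = OutputCleaner()
    #   res = cleaner.clean_list_data(
    #       [{"bbox": [1, 2, 3], "category": "Text", "text": "partial"}], case_id=1)
    #   res.cleaned_data  # -> [{"category": "Text", "text": "partial"}]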

    def clean_string_data(self, data_str: str, case_id: int) -> CleanedData:
        """Cleans string-type data."""
        print(f"πŸ”§ Cleaning String data - Case {case_id}")
        print(f" Original length: {len(data_str):,}")
        operations = {
            'type': 'str',
            'original_length': len(data_str),
            'delimiter_fixes': 0,
            'tail_truncated': False,
            'truncated_length': 0,
            'duplicate_dicts_removed': 0,
            'final_objects': 0
        }
        try:
            # Step 1: Detect and fix missing delimiters
            data_str, delimiter_fixes = self._fix_missing_delimiters(data_str)
            operations['delimiter_fixes'] = delimiter_fixes
            # Step 2: Truncate the last incomplete element
            data_str, tail_truncated = self._truncate_last_incomplete_element(data_str)
            operations['tail_truncated'] = tail_truncated
            operations['truncated_length'] = len(data_str)
            # Step 3: Remove duplicate complete dict objects, preserving order
            data_str, duplicate_removes = self._remove_duplicate_complete_dicts_preserve_order(data_str)
            operations['duplicate_dicts_removed'] = duplicate_removes
            # Step 4: Ensure correct JSON format
            data_str = self._ensure_json_format(data_str)
            # Step 5: Try to parse the final result
            final_data = self._parse_final_json(data_str)
            if final_data is not None:
                operations['final_objects'] = len(final_data)
                print(f" βœ… Cleaning complete: {len(final_data)} objects")
                return CleanedData(
                    case_id=case_id,
                    original_type='str',
                    original_length=operations['original_length'],
                    cleaned_data=final_data,
                    cleaning_operations=operations,
                    success=True
                )
            else:
                raise ValueError("Could not parse the cleaned data")
        except Exception as e:
            print(f" ❌ Cleaning failed: {e}")
            return CleanedData(
                case_id=case_id,
                original_type='str',
                original_length=operations['original_length'],
                cleaned_data=[],
                cleaning_operations=operations,
                success=False
            )

    def _fix_missing_delimiters(self, text: str) -> Tuple[str, int]:
        """Fixes missing delimiters."""
        fixes = 0

        def replace_delimiter(match):
            nonlocal fixes
            fixes += 1
            return '},{'

        text = self.missing_delimiter_pattern.sub(replace_delimiter, text)
        if fixes > 0:
            print(f" βœ… Fixed {fixes} missing delimiters")
        return text, fixes
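    # Illustrative example (assumed input): '{"a": 1} {b' becomes '{"a": 1},{b';
    # a brace already followed by a quote, e.g. '} {"bbox"', is left untouched
    # because of the (?!") lookahead in the pattern.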

    def _truncate_last_incomplete_element(self, text: str) -> Tuple[str, bool]:
        """Truncates the last incomplete element."""
        # For very long text (>50k chars) or text not ending with ']',
        # cut everything from the last '{"bbox":' onward.
        needs_truncation = (
            len(text) > 50000 or
            not text.strip().endswith(']')
        )
        if needs_truncation:
            # Check how many dict objects there are
            bbox_count = text.count('{"bbox":')
            # If there is only one dict object, do not truncate,
            # to avoid deleting the only object
            if bbox_count <= 1:
                print(f" ⚠️ Only {bbox_count} dict objects found, skipping truncation to avoid deleting all content")
                return text, False
            # Find the position of the last '{"bbox":'
            last_bbox_pos = text.rfind('{"bbox":')
            if last_bbox_pos > 0:
                # Truncate before this position
                truncated_text = text[:last_bbox_pos].rstrip()
                # Remove a trailing comma
                if truncated_text.endswith(','):
                    truncated_text = truncated_text[:-1]
                print(f" βœ‚οΈ Truncated the last incomplete element, length reduced from {len(text):,} to {len(truncated_text):,}")
                return truncated_text, True
        return text, False
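    # Illustrative example (assumed input):
    #   '[{"bbox": [0, 0, 5, 5], "category": "Text"}, {"bbox": [1, 2'
    # does not end with ']' and contains two objects, so the trailing fragment
    # is dropped:
    #   '[{"bbox": [0, 0, 5, 5], "category": "Text"}'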

    def _remove_duplicate_complete_dicts_preserve_order(self, text: str) -> Tuple[str, int]:
        """Removes duplicate complete dict objects, preserving the original order."""
        # Extract all dict objects in order of appearance
        dict_matches = list(self.dict_pattern.finditer(text))
        if not dict_matches:
            return text, 0
        print(f" πŸ“Š Found {len(dict_matches)} dict objects")
        # Deduplicate while preserving order: keep only the first occurrence of each dict
        unique_dicts = []
        seen_dict_strings = set()
        total_duplicates = 0
        for match in dict_matches:
            dict_str = match.group()
            if dict_str not in seen_dict_strings:
                unique_dicts.append(dict_str)
                seen_dict_strings.add(dict_str)
            else:
                total_duplicates += 1
        if total_duplicates > 0:
            # Reconstruct the JSON array, preserving the original order
            new_text = '[' + ', '.join(unique_dicts) + ']'
            print(f" βœ… Removed {total_duplicates} duplicate dicts, keeping {len(unique_dicts)} unique dicts (order preserved)")
            return new_text, total_duplicates
        else:
            print(" βœ… No duplicate dict objects found")
            return text, 0

    def _ensure_json_format(self, text: str) -> str:
        """Ensures the text is wrapped as a JSON array."""
        text = text.strip()
        if not text.startswith('['):
            text = '[' + text
        if not text.endswith(']'):
            # Remove a trailing comma before closing the array
            text = text.rstrip(',').rstrip()
            text += ']'
        return text
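    # Illustrative example (assumed input): '{"a": 1},' -> '[{"a": 1}]'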

    def _parse_final_json(self, text: str) -> Optional[List[Dict]]:
        """Tries to parse the final JSON."""
        try:
            data = json.loads(text)
            if isinstance(data, list):
                return data
        except json.JSONDecodeError as e:
            print(f" ❌ JSON parsing failed: {e}")
            # Fallback 1: extract the individually valid dict objects
            valid_dicts = []
            for match in self.dict_pattern.finditer(text):
                dict_str = match.group()
                try:
                    dict_obj = json.loads(dict_str)
                    valid_dicts.append(dict_obj)
                except json.JSONDecodeError:
                    continue
            if valid_dicts:
                print(f" βœ… Extracted {len(valid_dicts)} valid dicts")
                return valid_dicts
            # Fallback 2: special handling for a single incomplete dict
            return self._handle_single_incomplete_dict(text)
        return None

    def _handle_single_incomplete_dict(self, text: str) -> Optional[List[Dict]]:
        """Handles the special case of a single incomplete dict."""
        # Check whether this is the single-incomplete-dict case
        if not text.strip().startswith('[{"bbox":'):
            return None
        try:
            # Try to extract the bbox coordinates
            bbox_match = re.search(r'"bbox"\s*:\s*\[([^\]]+)\]', text)
            if not bbox_match:
                return None
            bbox_str = bbox_match.group(1)
            bbox_coords = [int(x.strip()) for x in bbox_str.split(',')]
            if len(bbox_coords) != 4:
                return None
            # Try to extract the category
            category_match = re.search(r'"category"\s*:\s*"([^"]+)"', text)
            category = category_match.group(1) if category_match else "Text"
            # Try to extract the beginning of the text (first 10,000 characters)
            text_match = re.search(r'"text"\s*:\s*"([^"]{0,10000})', text)
            text_content = text_match.group(1) if text_match else ""
            # Construct the repaired dict
            fixed_dict = {
                "bbox": bbox_coords,
                "category": category
            }
            if text_content:
                fixed_dict["text"] = text_content
            print(f" πŸ”§ Special fix: single incomplete dict β†’ {fixed_dict}")
            return [fixed_dict]
        except Exception as e:
            print(f" ❌ Special fix failed: {e}")
            return None
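    # Illustrative example (assumed input): the truncated single-object payload
    #   '[{"bbox": [1, 2, 3, 4], "category": "Text", "text": "cut of'
    # is repaired into [{"bbox": [1, 2, 3, 4], "category": "Text", "text": "cut of"}].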

    def remove_duplicate_category_text_pairs_and_bbox(self, data_list: List[dict], case_id: int) -> List[dict]:
        """Removes category-text pairs that repeat five or more times and bboxes
        that repeat at all, keeping the first occurrence of each."""
        if not data_list or len(data_list) <= 1:
            print(f" πŸ“Š Data length {len(data_list)} <= 1, skipping deduplication check")
            return data_list
        print(f" πŸ“Š Original data length: {len(data_list)}")
        # 1. Record the occurrence positions of each category-text pair
        category_text_pairs = {}
        for i, item in enumerate(data_list):
            if isinstance(item, dict) and 'category' in item and 'text' in item:
                pair_key = (item.get('category', ''), item.get('text', ''))
                if pair_key not in category_text_pairs:
                    category_text_pairs[pair_key] = []
                category_text_pairs[pair_key].append(i)
        # 2. Record the occurrence positions of each bbox
        bbox_pairs = {}
        for i, item in enumerate(data_list):
            if isinstance(item, dict) and 'bbox' in item:
                bbox = item.get('bbox')
                if isinstance(bbox, list) and len(bbox) > 0:
                    bbox_key = tuple(bbox)  # Convert to a tuple so it can serve as a dict key
                    if bbox_key not in bbox_pairs:
                        bbox_pairs[bbox_key] = []
                    bbox_pairs[bbox_key].append(i)
        # 3. Identify the items to remove
        duplicates_to_remove = set()
        # 3a. Category-text pairs that appear 5 or more times
        for pair_key, positions in category_text_pairs.items():
            if len(positions) >= 5:
                category, text = pair_key
                # Keep the first occurrence, remove subsequent duplicates
                positions_to_remove = positions[1:]
                duplicates_to_remove.update(positions_to_remove)
                print(f" πŸ” Found duplicate category-text pair: category='{category}', first 50 chars of text='{text[:50]}...'")
                print(f" Count: {len(positions)}, removing at positions: {positions_to_remove}")
        # 3b. Bboxes that appear 2 or more times
        for bbox_key, positions in bbox_pairs.items():
            if len(positions) >= 2:
                # Keep the first occurrence, remove subsequent duplicates
                positions_to_remove = positions[1:]
                duplicates_to_remove.update(positions_to_remove)
                print(f" πŸ” Found duplicate bbox: {list(bbox_key)}")
                print(f" Count: {len(positions)}, removing at positions: {positions_to_remove}")
        if not duplicates_to_remove:
            print(" βœ… No category-text pairs or bboxes found exceeding the duplication thresholds")
            return data_list
        # 4. Remove the duplicates from the original data (preserving order)
        cleaned_data = []
        removed_count = 0
        for i, item in enumerate(data_list):
            if i not in duplicates_to_remove:
                cleaned_data.append(item)
            else:
                removed_count += 1
        print(f" βœ… Deduplication complete: Removed {removed_count} duplicate items")
        print(f" πŸ“Š Cleaned data length: {len(cleaned_data)}")
        return cleaned_data
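    # Illustrative example (assumed input): if five items share the pair
    # ("Text", "lorem"), only the first survives; if two items share
    # bbox [1, 2, 3, 4], the second is dropped as well.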

    def clean_model_output(self, model_output):
        """Cleans a single raw model output (list or string) and returns the cleaned list."""
        try:
            # Select the cleaning method based on the data type
            if isinstance(model_output, list):
                result = self.clean_list_data(model_output, case_id=0)
            else:
                result = self.clean_string_data(str(model_output), case_id=0)
            # Deduplication step: remove duplicate category-text pairs and bboxes
            if result.success and result.cleaned_data:
                original_data = result.cleaned_data
                deduplicated_data = self.remove_duplicate_category_text_pairs_and_bbox(original_data, case_id=0)
                # Update the cleaned_data in the CleanedData object
                result.cleaned_data = deduplicated_data
            return result.cleaned_data
        except Exception as e:
            print(f"❌ Case cleaning failed: {e}")
            return model_output
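    # Usage sketch for the public entry point (illustrative values, not from the source):
    #   cleaner = OutputCleaner()
    #   cells = cleaner.clean_model_output(
    #       '{"bbox": [1, 2, 3, 4], "category": "Text", "text": "hi"}')
    #   # -> [{"bbox": [1, 2, 3, 4], "category": "Text", "text": "hi"}]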

    def clean_all_data(self, jsonl_path: str) -> List[CleanedData]:
        """Cleans all data from a JSONL file."""
        print(f"πŸš€ Starting to clean JSONL file: {jsonl_path}")
        with open(jsonl_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
        datas = []
        for i, line in enumerate(lines):
            if line.strip():
                try:
                    data = json.loads(line)
                    predict_field = data.get('predict')
                    case_id = i + 1
                    print(f"\n{'='*50}")
                    print(f"🎯 Cleaning Case {case_id}")
                    print(f"{'='*50}")
                    # Select the cleaning method based on the data type
                    if isinstance(predict_field, list):
                        print("πŸ“Š Data type: List")
                        result = self.clean_list_data(predict_field, case_id)
                    else:
                        print("πŸ“Š Data type: String")
                        result = self.clean_string_data(str(predict_field), case_id)
                    # Deduplication step: remove duplicate category-text pairs and bboxes
                    if result.success and result.cleaned_data:
                        print("πŸ”„ Checking for and removing duplicate category-text pairs and bboxes...")
                        original_data = result.cleaned_data
                        deduplicated_data = self.remove_duplicate_category_text_pairs_and_bbox(original_data, case_id)
                        # Update the cleaned_data in the CleanedData object
                        result.cleaned_data = deduplicated_data
                    data['predict_resized'] = result.cleaned_data
                    datas.append(data)
                    self.cleaned_results.append(result)
                except Exception as e:
                    print(f"❌ Case {i+1} cleaning failed: {e}")
                    traceback.print_exc()
        save_path = jsonl_path.replace('.jsonl', '_filtered.jsonl')
        with open(save_path, 'w', encoding='utf-8') as w:
            for data in datas:
                w.write(json.dumps(data, ensure_ascii=False) + '\n')
        print(f"βœ… Saved cleaned data to: {save_path}")
        return self.cleaned_results
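    # Expected JSONL shape (assumed from the code, one object per line):
    #   {"predict": "<raw model output>", ...}
    # The cleaned list is written back under 'predict_resized' in
    # <input>_filtered.jsonl.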

    def save_cleaned_data(self, output_dir: str):
        """Saves the cleaned data."""
        print(f"\nπŸ’Ύ Saving cleaned data to: {output_dir}")
        os.makedirs(output_dir, exist_ok=True)
        # 1. Save the cleaned data for each case
        for result in self.cleaned_results:
            case_filename = f"cleaned_case_{result.case_id:02d}.json"
            case_filepath = os.path.join(output_dir, case_filename)
            with open(case_filepath, 'w', encoding='utf-8') as f:
                json.dump(result.cleaned_data, f, ensure_ascii=False, indent=2)
            print(f" βœ… Case {result.case_id}: {len(result.cleaned_data)} objects β†’ {case_filename}")
        # 2. Save all cleaned data to a single file
        all_cleaned_data = []
        for result in self.cleaned_results:
            all_cleaned_data.append({
                'case_id': result.case_id,
                'original_type': result.original_type,
                'original_length': result.original_length,
                'cleaned_objects_count': len(result.cleaned_data),
                'success': result.success,
                'cleaning_operations': result.cleaning_operations,
                'cleaned_data': result.cleaned_data
            })
        all_data_filepath = os.path.join(output_dir, "all_cleaned_data.json")
        with open(all_data_filepath, 'w', encoding='utf-8') as f:
            json.dump(all_cleaned_data, f, ensure_ascii=False, indent=2)
        print(f" πŸ“ All data: {len(all_cleaned_data)} cases β†’ all_cleaned_data.json")
        # 3. Generate a cleaning report
        self._generate_cleaning_report(output_dir)

    def _generate_cleaning_report(self, output_dir: str):
        """Generates a cleaning report."""
        import datetime

        report = []
        report.append("πŸ“Š Data Cleaning Report")
        report.append("=" * 60)
        report.append(f"Processing Time: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        report.append("")
        # Overall statistics
        total_cases = len(self.cleaned_results)
        successful_cases = sum(1 for r in self.cleaned_results if r.success)
        total_objects = sum(len(r.cleaned_data) for r in self.cleaned_results)
        # Guard against division by zero when no cases were processed
        success_rate = successful_cases / total_cases * 100 if total_cases else 0.0
        report.append("πŸ“ˆ Overall Statistics:")
        report.append(f" Total Cases: {total_cases}")
        report.append(f" Successfully Cleaned: {successful_cases}")
        report.append(f" Success Rate: {success_rate:.1f}%")
        report.append(f" Total Recovered Objects: {total_objects}")
        report.append("")
        # Detailed statistics
        list_results = [r for r in self.cleaned_results if r.original_type == 'list']
        str_results = [r for r in self.cleaned_results if r.original_type == 'str']
        if list_results:
            report.append("πŸ“‹ List Type Cleaning Statistics:")
            for r in list_results:
                ops = r.cleaning_operations
                report.append(f" Case {r.case_id}: {ops['original_count']} β†’ {ops['final_count']} objects")
                if ops['bbox_fixes'] > 0:
                    report.append(f" - bbox fixes: {ops['bbox_fixes']}")
                if ops['removed_items'] > 0:
                    report.append(f" - invalid items removed: {ops['removed_items']}")
            report.append("")
        if str_results:
            report.append("πŸ“ String Type Cleaning Statistics:")
            for r in str_results:
                ops = r.cleaning_operations
                status = "βœ…" if r.success else "❌"
                report.append(f" Case {r.case_id} {status}: {ops['original_length']:,} chars β†’ {ops['final_objects']} objects")
                details = []
                if ops['delimiter_fixes'] > 0:
                    details.append(f"Delimiter fixes: {ops['delimiter_fixes']}")
                if ops['tail_truncated']:
                    reduction = ops['original_length'] - ops['truncated_length']
                    details.append(f"Tail truncation: -{reduction:,} chars")
                if ops['duplicate_dicts_removed'] > 0:
                    details.append(f"Duplicates removed: {ops['duplicate_dicts_removed']}")
                if details:
                    report.append(f" - {', '.join(details)}")
            report.append("")
        # Note on data order
        report.append("πŸ”„ Data Order Guarantee:")
        report.append(" βœ… The relative order of all dict objects is preserved during cleaning.")
        report.append(" βœ… When deduplicating, the first occurrence of a dict is kept and subsequent duplicates are removed.")
        report.append(" βœ… The order of items in List-type data is fully preserved.")
        # Save the report
        report_filepath = os.path.join(output_dir, "cleaning_report.txt")
        with open(report_filepath, 'w', encoding='utf-8') as f:
            f.write('\n'.join(report))
        print(" πŸ“‹ Cleaning report: cleaning_report.txt")
        # Also print the report to the console
        print('\n' + '\n'.join(report))


def main():
    """Main function."""
    # Create a data cleaner instance
    cleaner = OutputCleaner()
    # Input file
    jsonl_path = "output_with_failcase.jsonl"
    # Output directory
    output_dir = "output_with_failcase_cleaned"
    # Clean all data
    cleaner.clean_all_data(jsonl_path)
    # Save the cleaned data
    cleaner.save_cleaned_data(output_dir)
    print("\nπŸŽ‰ Data cleaning complete!")
    print(f"πŸ“ Cleaned data saved in: {output_dir}")


if __name__ == "__main__":
    main()