Spaces:
Runtime error
Runtime error
Changes to logging for better debugging
Browse files
- setup.sh +44 -1
- src/parsers/got_ocr_parser.py +73 -80
setup.sh
CHANGED
|
@@ -5,6 +5,9 @@ set -e
|
|
| 5 |
|
| 6 |
echo "Starting setup process..."
|
| 7 |
|
|
|
|
|
|
|
|
|
|
| 8 |
# Check if running with sudo/root permissions for system dependencies
|
| 9 |
if [ "$EUID" -eq 0 ]; then
|
| 10 |
# Install system dependencies
|
|
@@ -12,7 +15,8 @@ if [ "$EUID" -eq 0 ]; then
|
|
| 12 |
apt-get update && apt-get install -y \
|
| 13 |
wget \
|
| 14 |
pkg-config \
|
| 15 |
-
git
|
|
|
|
| 16 |
echo "System dependencies installed successfully"
|
| 17 |
else
|
| 18 |
echo "Not running as root. Skipping system dependencies installation."
|
|
@@ -41,6 +45,42 @@ echo "Installing Hugging Face CLI..."
|
|
| 41 |
pip install -q -U "huggingface_hub[cli]"
|
| 42 |
echo "Hugging Face CLI installed successfully"
|
| 43 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
# Install the project in development mode only if setup.py or pyproject.toml exists
|
| 45 |
if [ -f "setup.py" ] || [ -f "pyproject.toml" ]; then
|
| 46 |
echo "Installing project in development mode..."
|
|
@@ -62,4 +102,7 @@ if [ ! -f .env ]; then
|
|
| 62 |
fi
|
| 63 |
fi
|
| 64 |
|
|
|
|
|
|
|
|
|
|
| 65 |
echo "Setup process completed successfully!"
|
|
|
|
| 5 |
|
| 6 |
echo "Starting setup process..."
|
| 7 |
|
| 8 |
+
# Enable more verbose logging
|
| 9 |
+
set -x
|
| 10 |
+
|
| 11 |
# Check if running with sudo/root permissions for system dependencies
|
| 12 |
if [ "$EUID" -eq 0 ]; then
|
| 13 |
# Install system dependencies
|
|
|
|
| 15 |
apt-get update && apt-get install -y \
|
| 16 |
wget \
|
| 17 |
pkg-config \
|
| 18 |
+
git \
|
| 19 |
+
tree # Add tree for directory structure visualization
|
| 20 |
echo "System dependencies installed successfully"
|
| 21 |
else
|
| 22 |
echo "Not running as root. Skipping system dependencies installation."
|
|
|
|
| 45 |
pip install -q -U "huggingface_hub[cli]"
|
| 46 |
echo "Hugging Face CLI installed successfully"
|
| 47 |
|
| 48 |
+
# Add debug section for GOT-OCR repo
|
| 49 |
+
echo "===== GOT-OCR Repository Debugging ====="
|
| 50 |
+
|
| 51 |
+
# Clone the repository for inspection (if it doesn't exist)
|
| 52 |
+
TEMP_DIR="/tmp"
|
| 53 |
+
REPO_DIR="${TEMP_DIR}/GOT-OCR2.0"
|
| 54 |
+
|
| 55 |
+
if [ ! -d "$REPO_DIR" ]; then
|
| 56 |
+
echo "Cloning GOT-OCR2.0 repository for debugging..."
|
| 57 |
+
git clone https://github.com/Ucas-HaoranWei/GOT-OCR2.0.git "$REPO_DIR"
|
| 58 |
+
else
|
| 59 |
+
echo "GOT-OCR2.0 repository already exists at $REPO_DIR"
|
| 60 |
+
fi
|
| 61 |
+
|
| 62 |
+
# Check the repository structure
|
| 63 |
+
echo "GOT-OCR2.0 repository structure:"
|
| 64 |
+
if command -v tree &> /dev/null; then
|
| 65 |
+
tree -L 3 "$REPO_DIR"
|
| 66 |
+
else
|
| 67 |
+
find "$REPO_DIR" -type d -maxdepth 3 | sort
|
| 68 |
+
fi
|
| 69 |
+
|
| 70 |
+
# Check if the demo script exists
|
| 71 |
+
DEMO_SCRIPT="${REPO_DIR}/GOT/demo/run_ocr_2.0.py"
|
| 72 |
+
if [ -f "$DEMO_SCRIPT" ]; then
|
| 73 |
+
echo "Demo script found at: $DEMO_SCRIPT"
|
| 74 |
+
else
|
| 75 |
+
echo "ERROR: Demo script not found at: $DEMO_SCRIPT"
|
| 76 |
+
|
| 77 |
+
# Search for the script in the repository
|
| 78 |
+
echo "Searching for run_ocr_2.0.py in the repository..."
|
| 79 |
+
find "$REPO_DIR" -name "run_ocr_2.0.py" -type f
|
| 80 |
+
fi
|
| 81 |
+
|
| 82 |
+
echo "===== End of GOT-OCR Debugging ====="
|
| 83 |
+
|
| 84 |
# Install the project in development mode only if setup.py or pyproject.toml exists
|
| 85 |
if [ -f "setup.py" ] || [ -f "pyproject.toml" ]; then
|
| 86 |
echo "Installing project in development mode..."
|
|
|
|
| 102 |
fi
|
| 103 |
fi
|
| 104 |
|
| 105 |
+
# Return to normal logging
|
| 106 |
+
set +x
|
| 107 |
+
|
| 108 |
echo "Setup process completed successfully!"
|
src/parsers/got_ocr_parser.py
CHANGED
|
@@ -15,6 +15,8 @@ import latex2markdown
|
|
| 15 |
|
| 16 |
# Configure logging
|
| 17 |
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
| 18 |
|
| 19 |
class GotOcrParser(DocumentParser):
|
| 20 |
"""Parser implementation using GOT-OCR 2.0 for document text extraction using GitHub repository.
|
|
@@ -26,7 +28,6 @@ class GotOcrParser(DocumentParser):
|
|
| 26 |
# Path to the GOT-OCR repository
|
| 27 |
_repo_path = None
|
| 28 |
_weights_path = None
|
| 29 |
-
_demo_script_path = None
|
| 30 |
|
| 31 |
@classmethod
|
| 32 |
def get_name(cls) -> str:
|
|
@@ -79,69 +80,68 @@ class GotOcrParser(DocumentParser):
|
|
| 79 |
logger.error(f"Missing dependency: {e}")
|
| 80 |
return False
|
| 81 |
|
| 82 |
-
@classmethod
|
| 83 |
-
def _find_demo_script(cls, base_dir):
|
| 84 |
-
"""Find the run_ocr_2.0.py script by searching the repository.
|
| 85 |
-
|
| 86 |
-
Args:
|
| 87 |
-
base_dir: The base directory to start searching from
|
| 88 |
-
|
| 89 |
-
Returns:
|
| 90 |
-
Path to the script if found, None otherwise
|
| 91 |
-
"""
|
| 92 |
-
logger.info(f"Searching for run_ocr_2.0.py in {base_dir}")
|
| 93 |
-
script_paths = []
|
| 94 |
-
|
| 95 |
-
# Walk through all directories and find all instances of run_ocr_2.0.py
|
| 96 |
-
for root, dirs, files in os.walk(base_dir):
|
| 97 |
-
if "run_ocr_2.0.py" in files:
|
| 98 |
-
script_path = os.path.join(root, "run_ocr_2.0.py")
|
| 99 |
-
script_paths.append(script_path)
|
| 100 |
-
logger.info(f"Found run_ocr_2.0.py at: {script_path}")
|
| 101 |
-
|
| 102 |
-
if not script_paths:
|
| 103 |
-
logger.error("Could not find run_ocr_2.0.py in the repository")
|
| 104 |
-
return None
|
| 105 |
-
|
| 106 |
-
# If there are multiple instances, try to find the one in demo folder
|
| 107 |
-
for path in script_paths:
|
| 108 |
-
if os.path.join("demo", "run_ocr_2.0.py") in path:
|
| 109 |
-
logger.info(f"Selected demo script at: {path}")
|
| 110 |
-
return path
|
| 111 |
-
|
| 112 |
-
# If no clear demo folder, just use the first one found
|
| 113 |
-
logger.info(f"Selected demo script at: {script_paths[0]}")
|
| 114 |
-
return script_paths[0]
|
| 115 |
-
|
| 116 |
@classmethod
|
| 117 |
def _setup_repository(cls) -> bool:
|
| 118 |
"""Set up the GOT-OCR2.0 repository if it's not already set up."""
|
| 119 |
-
if cls._repo_path is not None and os.path.exists(cls._repo_path)
|
|
|
|
| 120 |
return True
|
| 121 |
|
| 122 |
try:
|
| 123 |
# Create a temporary directory for the repository
|
| 124 |
repo_dir = os.path.join(tempfile.gettempdir(), "GOT-OCR2.0")
|
|
|
|
| 125 |
|
| 126 |
# Check if the repository already exists
|
| 127 |
if not os.path.exists(repo_dir):
|
| 128 |
-
logger.info("Cloning GOT-OCR2.0 repository...")
|
| 129 |
subprocess.run(
|
| 130 |
["git", "clone", "https://github.com/Ucas-HaoranWei/GOT-OCR2.0.git", repo_dir],
|
| 131 |
check=True
|
| 132 |
)
|
| 133 |
else:
|
| 134 |
-
logger.info("GOT-OCR2.0 repository already exists, skipping clone")
|
| 135 |
|
| 136 |
cls._repo_path = repo_dir
|
| 137 |
|
| 138 |
-
#
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
|
| 146 |
# Set up the weights directory
|
| 147 |
weights_dir = os.path.join(repo_dir, "GOT_weights")
|
|
@@ -149,6 +149,7 @@ class GotOcrParser(DocumentParser):
|
|
| 149 |
os.makedirs(weights_dir, exist_ok=True)
|
| 150 |
|
| 151 |
cls._weights_path = weights_dir
|
|
|
|
| 152 |
|
| 153 |
# Check if weights exist, if not download them
|
| 154 |
weight_files = [f for f in os.listdir(weights_dir) if f.endswith(".bin") or f.endswith(".safetensors")]
|
|
@@ -221,17 +222,35 @@ class GotOcrParser(DocumentParser):
|
|
| 221 |
try:
|
| 222 |
logger.info(f"Processing image with GOT-OCR: {file_path}")
|
| 223 |
|
| 224 |
-
# Check if demo script exists
|
| 225 |
-
if not self._demo_script_path or not os.path.exists(self._demo_script_path):
|
| 226 |
-
logger.warning("Demo script path not found, trying to locate it again")
|
| 227 |
-
self._demo_script_path = self._find_demo_script(self._repo_path)
|
| 228 |
-
if not self._demo_script_path:
|
| 229 |
-
raise RuntimeError("Could not find the run_ocr_2.0.py script in the repository")
|
| 230 |
-
|
| 231 |
# Create the command for running the GOT-OCR script
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 232 |
cmd = [
|
| 233 |
sys.executable,
|
| 234 |
-
|
| 235 |
"--model-name", self._weights_path,
|
| 236 |
"--image-file", str(file_path),
|
| 237 |
"--type", ocr_type
|
|
@@ -263,18 +282,7 @@ class GotOcrParser(DocumentParser):
|
|
| 263 |
# If render was requested, find and return the path to the HTML file
|
| 264 |
if render:
|
| 265 |
# The rendered results are in /results/demo.html according to the README
|
| 266 |
-
|
| 267 |
-
if not os.path.exists(results_dir):
|
| 268 |
-
# Try to find results directory
|
| 269 |
-
for root, dirs, files in os.walk(self._repo_path):
|
| 270 |
-
if "demo.html" in files:
|
| 271 |
-
html_result_path = os.path.join(root, "demo.html")
|
| 272 |
-
logger.info(f"Found rendered HTML at: {html_result_path}")
|
| 273 |
-
with open(html_result_path, 'r') as f:
|
| 274 |
-
html_content = f.read()
|
| 275 |
-
return html_content
|
| 276 |
-
|
| 277 |
-
html_result_path = os.path.join(results_dir, "demo.html")
|
| 278 |
if os.path.exists(html_result_path):
|
| 279 |
with open(html_result_path, 'r') as f:
|
| 280 |
html_content = f.read()
|
|
@@ -294,21 +302,6 @@ class GotOcrParser(DocumentParser):
|
|
| 294 |
except subprocess.CalledProcessError as e:
|
| 295 |
logger.error(f"Error running GOT-OCR command: {str(e)}")
|
| 296 |
logger.error(f"Stderr: {e.stderr}")
|
| 297 |
-
|
| 298 |
-
# Print repository structure for debugging
|
| 299 |
-
logger.error("Repository structure for debugging:")
|
| 300 |
-
try:
|
| 301 |
-
subprocess.run(
|
| 302 |
-
["find", self._repo_path, "-type", "f", "-name", "*.py"],
|
| 303 |
-
check=True,
|
| 304 |
-
capture_output=True,
|
| 305 |
-
text=True
|
| 306 |
-
)
|
| 307 |
-
structure_output = subprocess.getoutput(f"find {self._repo_path} -type f -name '*.py'")
|
| 308 |
-
logger.error(f"Python files in repository:\n{structure_output}")
|
| 309 |
-
except Exception as debug_e:
|
| 310 |
-
logger.error(f"Error getting repository structure: {debug_e}")
|
| 311 |
-
|
| 312 |
raise RuntimeError(f"Error processing document with GOT-OCR: {str(e)}")
|
| 313 |
|
| 314 |
except Exception as e:
|
|
|
|
| 15 |
|
| 16 |
# Configure logging
|
| 17 |
logger = logging.getLogger(__name__)
|
| 18 |
+
# Set logger level to DEBUG for more verbose output
|
| 19 |
+
logger.setLevel(logging.DEBUG)
|
| 20 |
|
| 21 |
class GotOcrParser(DocumentParser):
|
| 22 |
"""Parser implementation using GOT-OCR 2.0 for document text extraction using GitHub repository.
|
|
|
|
| 28 |
# Path to the GOT-OCR repository
|
| 29 |
_repo_path = None
|
| 30 |
_weights_path = None
|
|
|
|
| 31 |
|
| 32 |
@classmethod
|
| 33 |
def get_name(cls) -> str:
|
|
|
|
| 80 |
logger.error(f"Missing dependency: {e}")
|
| 81 |
return False
|
| 82 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
@classmethod
|
| 84 |
def _setup_repository(cls) -> bool:
|
| 85 |
"""Set up the GOT-OCR2.0 repository if it's not already set up."""
|
| 86 |
+
if cls._repo_path is not None and os.path.exists(cls._repo_path):
|
| 87 |
+
logger.debug(f"Repository already set up at: {cls._repo_path}")
|
| 88 |
return True
|
| 89 |
|
| 90 |
try:
|
| 91 |
# Create a temporary directory for the repository
|
| 92 |
repo_dir = os.path.join(tempfile.gettempdir(), "GOT-OCR2.0")
|
| 93 |
+
logger.debug(f"Repository directory: {repo_dir}")
|
| 94 |
|
| 95 |
# Check if the repository already exists
|
| 96 |
if not os.path.exists(repo_dir):
|
| 97 |
+
logger.info(f"Cloning GOT-OCR2.0 repository to {repo_dir}...")
|
| 98 |
subprocess.run(
|
| 99 |
["git", "clone", "https://github.com/Ucas-HaoranWei/GOT-OCR2.0.git", repo_dir],
|
| 100 |
check=True
|
| 101 |
)
|
| 102 |
else:
|
| 103 |
+
logger.info(f"GOT-OCR2.0 repository already exists at {repo_dir}, skipping clone")
|
| 104 |
|
| 105 |
cls._repo_path = repo_dir
|
| 106 |
|
| 107 |
+
# Debug: List repository contents
|
| 108 |
+
logger.debug("Repository contents:")
|
| 109 |
+
try:
|
| 110 |
+
result = subprocess.run(
|
| 111 |
+
["find", repo_dir, "-type", "d", "-maxdepth", "3"],
|
| 112 |
+
check=True,
|
| 113 |
+
capture_output=True,
|
| 114 |
+
text=True
|
| 115 |
+
)
|
| 116 |
+
for line in result.stdout.splitlines():
|
| 117 |
+
logger.debug(f" {line}")
|
| 118 |
+
except Exception as e:
|
| 119 |
+
logger.warning(f"Could not list repository contents: {e}")
|
| 120 |
+
|
| 121 |
+
# Check if the demo script exists
|
| 122 |
+
demo_script = os.path.join(repo_dir, "GOT", "demo", "run_ocr_2.0.py")
|
| 123 |
+
if os.path.exists(demo_script):
|
| 124 |
+
logger.info(f"Found demo script at: {demo_script}")
|
| 125 |
+
else:
|
| 126 |
+
logger.warning(f"Demo script not found at expected path: {demo_script}")
|
| 127 |
+
# Try to find it
|
| 128 |
+
logger.info("Searching for run_ocr_2.0.py in the repository...")
|
| 129 |
+
try:
|
| 130 |
+
find_result = subprocess.run(
|
| 131 |
+
["find", repo_dir, "-name", "run_ocr_2.0.py", "-type", "f"],
|
| 132 |
+
check=True,
|
| 133 |
+
capture_output=True,
|
| 134 |
+
text=True
|
| 135 |
+
)
|
| 136 |
+
if find_result.stdout.strip():
|
| 137 |
+
found_paths = find_result.stdout.strip().splitlines()
|
| 138 |
+
logger.info(f"Found script at alternative locations: {found_paths}")
|
| 139 |
+
# Use the first found path as fallback
|
| 140 |
+
if found_paths:
|
| 141 |
+
alternative_path = found_paths[0]
|
| 142 |
+
logger.info(f"Using alternative path: {alternative_path}")
|
| 143 |
+
except Exception as e:
|
| 144 |
+
logger.warning(f"Could not search for script: {e}")
|
| 145 |
|
| 146 |
# Set up the weights directory
|
| 147 |
weights_dir = os.path.join(repo_dir, "GOT_weights")
|
|
|
|
| 149 |
os.makedirs(weights_dir, exist_ok=True)
|
| 150 |
|
| 151 |
cls._weights_path = weights_dir
|
| 152 |
+
logger.debug(f"Weights directory: {weights_dir}")
|
| 153 |
|
| 154 |
# Check if weights exist, if not download them
|
| 155 |
weight_files = [f for f in os.listdir(weights_dir) if f.endswith(".bin") or f.endswith(".safetensors")]
|
|
|
|
| 222 |
try:
|
| 223 |
logger.info(f"Processing image with GOT-OCR: {file_path}")
|
| 224 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 225 |
# Create the command for running the GOT-OCR script
|
| 226 |
+
script_path = os.path.join(self._repo_path, "GOT", "demo", "run_ocr_2.0.py")
|
| 227 |
+
|
| 228 |
+
# Check if the script exists at the expected path
|
| 229 |
+
if not os.path.exists(script_path):
|
| 230 |
+
logger.error(f"Script not found at: {script_path}")
|
| 231 |
+
|
| 232 |
+
# Try to find the script within the repository
|
| 233 |
+
logger.info("Searching for run_ocr_2.0.py in the repository...")
|
| 234 |
+
try:
|
| 235 |
+
find_result = subprocess.run(
|
| 236 |
+
["find", self._repo_path, "-name", "run_ocr_2.0.py", "-type", "f"],
|
| 237 |
+
check=True,
|
| 238 |
+
capture_output=True,
|
| 239 |
+
text=True
|
| 240 |
+
)
|
| 241 |
+
found_paths = find_result.stdout.strip().splitlines()
|
| 242 |
+
if found_paths:
|
| 243 |
+
script_path = found_paths[0]
|
| 244 |
+
logger.info(f"Found script at alternative location: {script_path}")
|
| 245 |
+
else:
|
| 246 |
+
raise FileNotFoundError(f"Could not find run_ocr_2.0.py in repository: {self._repo_path}")
|
| 247 |
+
except Exception as search_e:
|
| 248 |
+
logger.error(f"Error searching for script: {str(search_e)}")
|
| 249 |
+
raise FileNotFoundError(f"Script not found and search failed: {str(search_e)}")
|
| 250 |
+
|
| 251 |
cmd = [
|
| 252 |
sys.executable,
|
| 253 |
+
script_path,
|
| 254 |
"--model-name", self._weights_path,
|
| 255 |
"--image-file", str(file_path),
|
| 256 |
"--type", ocr_type
|
|
|
|
| 282 |
# If render was requested, find and return the path to the HTML file
|
| 283 |
if render:
|
| 284 |
# The rendered results are in /results/demo.html according to the README
|
| 285 |
+
html_result_path = os.path.join(self._repo_path, "results", "demo.html")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 286 |
if os.path.exists(html_result_path):
|
| 287 |
with open(html_result_path, 'r') as f:
|
| 288 |
html_content = f.read()
|
|
|
|
| 302 |
except subprocess.CalledProcessError as e:
|
| 303 |
logger.error(f"Error running GOT-OCR command: {str(e)}")
|
| 304 |
logger.error(f"Stderr: {e.stderr}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 305 |
raise RuntimeError(f"Error processing document with GOT-OCR: {str(e)}")
|
| 306 |
|
| 307 |
except Exception as e:
|