redhairedshanks1 committed on
Commit
1482837
·
1 Parent(s): dd3d80f

Update utilities/extract_tables.py

Browse files
Files changed (1) hide show
  1. utilities/extract_tables.py +47 -6
utilities/extract_tables.py CHANGED
@@ -49,6 +49,15 @@ def extract_tables_remote(state):
49
 
50
  # Extract just the filename (not full path) to match curl format
51
  file_basename = os.path.basename(path)
 
 
 
 
 
 
 
 
 
52
 
53
  with open(path, "rb") as f:
54
  # IMPORTANT: Use basename for the file tuple (matches curl format)
@@ -62,12 +71,44 @@ def extract_tables_remote(state):
62
  }
63
  headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_API_TOKEN')}"}
64
 
65
- # Call API and wait for response
66
- resp = requests.post(EXTRACT_TABLES_API, files=files, data=data, headers=headers, timeout=120)
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
68
  if resp.status_code != 200:
69
- raise RuntimeError(f"Extract tables API failed: {resp.text}")
 
70
 
71
- js = resp.json()
72
- state["tables"] = js.get("tables", js)
73
- return state
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
  # Extract just the filename (not full path) to match curl format
51
  file_basename = os.path.basename(path)
52
+ file_size = os.path.getsize(path)
53
+
54
+ print(f"\n{'='*60}")
55
+ print(f"πŸ“Š EXTRACT TABLES API CALL")
56
+ print(f"{'='*60}")
57
+ print(f"File: {filename}")
58
+ print(f"Basename: {file_basename}")
59
+ print(f"Size: {file_size} bytes")
60
+ print(f"API URL: {EXTRACT_TABLES_API}")
61
 
62
  with open(path, "rb") as f:
63
  # IMPORTANT: Use basename for the file tuple (matches curl format)
 
71
  }
72
  headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_API_TOKEN')}"}
73
 
74
+ print(f"πŸš€ Sending request...")
75
+ print(f"Data params: {data}")
76
+
77
+ # Call API and wait for response - 5 minute timeout for large files
78
+ try:
79
+ resp = requests.post(EXTRACT_TABLES_API, files=files, data=data, headers=headers, timeout=300)
80
+ except requests.exceptions.Timeout:
81
+ print(f"❌ Request timed out after 300 seconds")
82
+ raise RuntimeError("Table extraction API timed out after 5 minutes")
83
+ except requests.exceptions.RequestException as e:
84
+ print(f"❌ Request exception: {str(e)}")
85
+ raise RuntimeError(f"API request failed: {str(e)}")
86
+
87
+ print(f"\nπŸ“₯ API Response:")
88
+ print(f"Status Code: {resp.status_code}")
89
 
90
  if resp.status_code != 200:
91
+ print(f"❌ Error: {resp.text[:500]}")
92
+ raise RuntimeError(f"Extract tables API failed with status {resp.status_code}: {resp.text}")
93
 
94
+ try:
95
+ response_json = resp.json()
96
+ print(f"Response keys: {list(response_json.keys())}")
97
+
98
+ # IMPORTANT: API returns {"status": "completed", "result": [...tables...], "process_id": "..."}
99
+ # NOT {"tables": [...]}
100
+ tables = response_json.get("result", []) or response_json.get("tables", [])
101
+
102
+ print(f"\nπŸ“Š Extraction Result:")
103
+ print(f"API Status: {response_json.get('status', 'unknown')}")
104
+ print(f"Process ID: {response_json.get('process_id', 'none')}")
105
+ print(f"Tables found: {len(tables) if isinstance(tables, list) else 0}")
106
+ print(f"{'='*60}\n")
107
+
108
+ state["tables"] = tables
109
+ return state
110
+
111
+ except Exception as e:
112
+ print(f"❌ Error parsing response: {str(e)}")
113
+ print(f"Raw response: {resp.text[:500]}")
114
+ raise RuntimeError(f"Failed to parse API response: {str(e)}")