File size: 4,379 Bytes
bf161ca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7b72f28
 
bf161ca
 
7b72f28
 
 
bf161ca
7b72f28
 
bf161ca
7b72f28
 
 
 
 
 
 
 
d4c0b27
 
 
1482837
 
 
 
 
 
 
 
 
7b72f28
bf161ca
d4c0b27
 
 
 
bf161ca
d4c0b27
 
 
bf161ca
 
7b72f28
1482837
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bf161ca
 
1482837
 
bf161ca
1482837
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# import os
# import requests

# EXTRACT_TABLES_API = "https://point9-extract-text-and-table.hf.space/api/tables"  # Replace with your space URL

# def extract_tables_remote(state):
#     filename = state["filename"]
#     path = state["temp_files"][filename]

#     with open(path, "rb") as f:
#         files = {"file": (filename, f, "application/pdf")}
#         data = {
#             "filename": filename,
#             "start_page": state.get("start_page", 1),
#             "end_page": state.get("end_page", 1),
#         }
#         headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_API_TOKEN')}"}
#         resp = requests.post(EXTRACT_TABLES_API, files=files, data=data, headers=headers)

#     if resp.status_code != 200:
#         raise RuntimeError(f"Extract tables API failed: {resp.text}")

#     js = resp.json()
#     state["tables"] = js.get("tables", js)
#     return state

import os
import requests

# Endpoint of the remote table-extraction service.
# The URL is hardcoded on purpose - DO NOT CHANGE.
EXTRACT_TABLES_API = (
    "https://point9-extract-text-and-table.hf.space"
    "/api/tables"
)

def extract_tables_remote(state):
    """
    Extract tables from a document by uploading it to the remote tables API.

    Every file is sent to the API (no local processing). Progress and
    diagnostics are printed to stdout.

    Args:
        state: Dict with 'filename' and 'temp_files' (mapping of filename to
            a path on disk), plus optional 'start_page' and 'end_page'
            (each defaults to 1 when absent).

    Returns:
        state: The same dict, with 'tables' set to the response's 'result'
            value (falling back to its 'tables' value, then to an empty list).

    Raises:
        KeyError: If 'filename', 'temp_files', or the entry for the filename
            is missing from ``state``.
        RuntimeError: If the file does not exist, the request fails or times
            out, the API answers with a non-200 status, or the response body
            is not a JSON object.
    """
    filename = state["filename"]
    path = state["temp_files"][filename]

    if not os.path.exists(path):
        raise RuntimeError(f"File not found: {path}")

    # Send only the basename (never the full local path) to match the curl format.
    file_basename = os.path.basename(path)
    file_size = os.path.getsize(path)

    # 5 minutes, to leave room for large files.
    timeout_seconds = 300
    banner = "=" * 60

    print(f"\n{banner}")
    print("📊 EXTRACT TABLES API CALL")
    print(banner)
    print(f"File: {filename}")
    print(f"Basename: {file_basename}")
    print(f"Size: {file_size} bytes")
    print(f"API URL: {EXTRACT_TABLES_API}")

    # Page numbers are sent as strings, mirroring what `curl -F` would post.
    data = {
        "filename": file_basename,
        "start_page": str(state.get("start_page", 1)),
        "end_page": str(state.get("end_page", 1)),
    }

    # Only attach the Authorization header when a token is configured;
    # otherwise the literal string "Bearer None" would be sent.
    token = os.getenv("HUGGINGFACE_API_TOKEN")
    headers = {"Authorization": f"Bearer {token}"} if token else {}

    with open(path, "rb") as f:
        # NOTE(review): the part is always labelled application/pdf, whatever
        # the real file type is - confirm the API does not depend on it.
        files = {"file": (file_basename, f, "application/pdf")}

        print("🚀 Sending request...")
        print(f"Data params: {data}")

        # The request must complete while the file handle is still open.
        try:
            resp = requests.post(
                EXTRACT_TABLES_API,
                files=files,
                data=data,
                headers=headers,
                timeout=timeout_seconds,
            )
        except requests.exceptions.Timeout as e:
            print(f"❌ Request timed out after {timeout_seconds} seconds")
            raise RuntimeError("Table extraction API timed out after 5 minutes") from e
        except requests.exceptions.RequestException as e:
            print(f"❌ Request exception: {e}")
            raise RuntimeError(f"API request failed: {e}") from e

    print("\n📥 API Response:")
    print(f"Status Code: {resp.status_code}")

    if resp.status_code != 200:
        print(f"❌ Error: {resp.text[:500]}")
        raise RuntimeError(f"Extract tables API failed with status {resp.status_code}: {resp.text}")

    # Keep the try body minimal: only the JSON decoding can legitimately fail here.
    try:
        response_json = resp.json()
    except ValueError as e:
        print(f"❌ Error parsing response: {e}")
        print(f"Raw response: {resp.text[:500]}")
        raise RuntimeError(f"Failed to parse API response: {e}") from e

    # The lookups below need a JSON object; reject any other payload explicitly.
    if not isinstance(response_json, dict):
        problem = f"expected a JSON object, got {type(response_json).__name__}"
        print(f"❌ Error parsing response: {problem}")
        print(f"Raw response: {resp.text[:500]}")
        raise RuntimeError(f"Failed to parse API response: {problem}")

    print(f"Response keys: {list(response_json.keys())}")

    # The API returns {"status": ..., "result": [...tables...], "process_id": ...};
    # "tables" is only consulted when "result" is missing or empty.
    tables = response_json.get("result", []) or response_json.get("tables", [])

    print("\n📊 Extraction Result:")
    print(f"API Status: {response_json.get('status', 'unknown')}")
    print(f"Process ID: {response_json.get('process_id', 'none')}")
    print(f"Tables found: {len(tables) if isinstance(tables, list) else 0}")
    print(f"{banner}\n")

    state["tables"] = tables
    return state