redhairedshanks1 committed on
Commit
d68f17d
·
1 Parent(s): 7b72f28

Update utilities/extract_text.py

Browse files
Files changed (1) hide show
  1. utilities/extract_text.py +47 -3
utilities/extract_text.py CHANGED
@@ -46,6 +46,20 @@ def extract_text_remote(state):
46
  if not os.path.exists(path):
47
  raise RuntimeError(f"File not found: {path}")
48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  with open(path, "rb") as f:
50
  files = {"file": (filename, f, "application/pdf")}
51
  data = {
@@ -55,11 +69,41 @@ def extract_text_remote(state):
55
  }
56
  headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_API_TOKEN')}"}
57
 
 
 
 
58
  # Call API and wait for response
59
- resp = requests.post(EXTRACT_TEXT_API, files=files, data=data, headers=headers)
60
 
 
 
 
 
61
  if resp.status_code != 200:
 
62
  raise RuntimeError(f"Extract text API failed: {resp.text}")
63
 
64
- state["text"] = resp.json().get("text", "")
65
- return state
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  if not os.path.exists(path):
47
  raise RuntimeError(f"File not found: {path}")
48
 
49
+ # Get file size for debugging
50
+ file_size = os.path.getsize(path)
51
+
52
+ print(f"\n{'='*60}")
53
+ print(f"📄 EXTRACT TEXT API CALL")
54
+ print(f"{'='*60}")
55
+ print(f"File: {filename}")
56
+ print(f"Path: {path}")
57
+ print(f"Size: {file_size} bytes")
58
+ print(f"Start Page: {state.get('start_page', 1)}")
59
+ print(f"End Page: {state.get('end_page', 1)}")
60
+ print(f"API URL: {EXTRACT_TEXT_API}")
61
+ print(f"Auth Token: {'✓ Set' if os.getenv('HUGGINGFACE_API_TOKEN') else '✗ Not Set'}")
62
+
63
  with open(path, "rb") as f:
64
  files = {"file": (filename, f, "application/pdf")}
65
  data = {
 
69
  }
70
  headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_API_TOKEN')}"}
71
 
72
+ print(f"\n🚀 Sending request to API...")
73
+ print(f"Data params: {data}")
74
+
75
  # Call API and wait for response
76
+ resp = requests.post(EXTRACT_TEXT_API, files=files, data=data, headers=headers, timeout=120)
77
 
78
+ print(f"\n📥 API Response:")
79
+ print(f"Status Code: {resp.status_code}")
80
+ print(f"Response Headers: {dict(resp.headers)}")
81
+
82
  if resp.status_code != 200:
83
+ print(f"❌ Error Response: {resp.text[:500]}")
84
  raise RuntimeError(f"Extract text API failed: {resp.text}")
85
 
86
+ try:
87
+ response_json = resp.json()
88
+ print(f"Response JSON keys: {list(response_json.keys())}")
89
+ print(f"Response JSON: {str(response_json)[:500]}")
90
+
91
+ extracted_text = response_json.get("text", "")
92
+ text_length = len(extracted_text) if extracted_text else 0
93
+
94
+ print(f"\n📊 Extraction Result:")
95
+ print(f"Text Length: {text_length} characters")
96
+ if text_length > 0:
97
+ print(f"First 200 chars: {extracted_text[:200]}")
98
+ else:
99
+ print(f"⚠️ WARNING: API returned EMPTY text!")
100
+ print(f"Full response: {response_json}")
101
+
102
+ state["text"] = extracted_text
103
+ print(f"{'='*60}\n")
104
+ return state
105
+
106
+ except Exception as e:
107
+ print(f"❌ Error parsing response: {str(e)}")
108
+ print(f"Raw response: {resp.text[:500]}")
109
+ raise RuntimeError(f"Failed to parse API response: {str(e)}")