redhairedshanks1 committed on
Commit
37b146d
·
1 Parent(s): d68f17d

Update utilities/extract_text.py

Browse files
Files changed (1) hide show
  1. utilities/extract_text.py +13 -4
utilities/extract_text.py CHANGED
@@ -49,10 +49,14 @@ def extract_text_remote(state):
49
  # Get file size for debugging
50
  file_size = os.path.getsize(path)
51
 
 
 
 
52
  print(f"\n{'='*60}")
53
  print(f"📄 EXTRACT TEXT API CALL")
54
  print(f"{'='*60}")
55
  print(f"File: {filename}")
 
56
  print(f"Path: {path}")
57
  print(f"Size: {file_size} bytes")
58
  print(f"Start Page: {state.get('start_page', 1)}")
@@ -61,16 +65,21 @@ def extract_text_remote(state):
61
  print(f"Auth Token: {'✓ Set' if os.getenv('HUGGINGFACE_API_TOKEN') else '✗ Not Set'}")
62
 
63
  with open(path, "rb") as f:
64
- files = {"file": (filename, f, "application/pdf")}
 
 
 
65
  data = {
66
- "filename": filename,
67
- "start_page": state.get("start_page", 1),
68
- "end_page": state.get("end_page", 1)
69
  }
70
  headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_API_TOKEN')}"}
71
 
72
  print(f"\n🚀 Sending request to API...")
 
73
  print(f"Data params: {data}")
 
74
 
75
  # Call API and wait for response
76
  resp = requests.post(EXTRACT_TEXT_API, files=files, data=data, headers=headers, timeout=120)
 
49
  # Get file size for debugging
50
  file_size = os.path.getsize(path)
51
 
52
+ # Extract just the filename (not full path) to match curl format
53
+ file_basename = os.path.basename(path)
54
+
55
  print(f"\n{'='*60}")
56
  print(f"📄 EXTRACT TEXT API CALL")
57
  print(f"{'='*60}")
58
  print(f"File: {filename}")
59
+ print(f"Basename: {file_basename}")
60
  print(f"Path: {path}")
61
  print(f"Size: {file_size} bytes")
62
  print(f"Start Page: {state.get('start_page', 1)}")
 
65
  print(f"Auth Token: {'✓ Set' if os.getenv('HUGGINGFACE_API_TOKEN') else '✗ Not Set'}")
66
 
67
  with open(path, "rb") as f:
68
+ # IMPORTANT: Use basename for the file tuple (matches curl format)
69
+ files = {"file": (file_basename, f, "application/pdf")}
70
+
71
+ # IMPORTANT: Convert page numbers to strings (matches curl -F format)
72
  data = {
73
+ "filename": file_basename, # Just filename, not full path
74
+ "start_page": str(state.get("start_page", 1)), # String, not int
75
+ "end_page": str(state.get("end_page", 1)) # String, not int
76
  }
77
  headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_API_TOKEN')}"}
78
 
79
  print(f"\n🚀 Sending request to API...")
80
+ print(f"File tuple: ('file', ('{file_basename}', <binary>, 'application/pdf'))")
81
  print(f"Data params: {data}")
82
+ print(f"Data types: start_page={type(data['start_page'])}, end_page={type(data['end_page'])}")
83
 
84
  # Call API and wait for response
85
  resp = requests.post(EXTRACT_TEXT_API, files=files, data=data, headers=headers, timeout=120)