redhairedshanks1 committed on
Commit
39d1de3
·
1 Parent(s): 310e8f9

Update utilities/extract_text.py

Browse files
Files changed (1) hide show
  1. utilities/extract_text.py +59 -23
utilities/extract_text.py CHANGED
@@ -1,24 +1,60 @@
1
- import os
2
- import requests
3
-
4
- EXTRACT_TEXT_API = "https://point9-extract-text-and-table.hf.space/api/text" # Replace with your space URL
5
-
6
- def extract_text_remote(state):
7
- filename = state["filename"]
8
- path = state["temp_files"][filename]
9
-
10
- with open(path, "rb") as f:
11
- files = {"file": (filename, f, "application/pdf")}
12
- data = {
13
- "filename": filename,
14
- "start_page": state.get("start_page", 1),
15
- "end_page": state.get("end_page", 1)
16
- }
17
- headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_API_TOKEN')}"}
18
- resp = requests.post(EXTRACT_TEXT_API, files=files, data=data, headers=headers)
19
-
20
- if resp.status_code != 200:
21
- raise RuntimeError(f"Extract text API failed: {resp.text}")
22
-
23
- state["text"] = resp.json().get("text", "")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  return state
 
1
+ # import os
2
+ # import requests
3
+
4
+ # EXTRACT_TEXT_API = "https://point9-extract-text-and-table.hf.space/api/text" # Replace with your space URL
5
+
6
+ # def extract_text_remote(state):
7
+ # filename = state["filename"]
8
+ # path = state["temp_files"][filename]
9
+
10
+ # with open(path, "rb") as f:
11
+ # files = {"file": (filename, f, "application/pdf")}
12
+ # data = {
13
+ # "filename": filename,
14
+ # "start_page": state.get("start_page", 1),
15
+ # "end_page": state.get("end_page", 1)
16
+ # }
17
+ # headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_API_TOKEN')}"}
18
+ # resp = requests.post(EXTRACT_TEXT_API, files=files, data=data, headers=headers)
19
+
20
+ # if resp.status_code != 200:
21
+ # raise RuntimeError(f"Extract text API failed: {resp.text}")
22
+
23
+ # state["text"] = resp.json().get("text", "")
24
+ # return state
25
+
26
+ import os
27
+ import requests
28
+
29
+ EXTRACT_TEXT_API = "https://point9-extract-text-and-table.hf.space/api/text" # Replace with your space URL
30
+
31
def extract_text_remote(state):
    """Upload the document named in *state* to the remote text-extraction API.

    Reads ``state["filename"]`` and looks up its local path in
    ``state["temp_files"]``. The file is posted together with
    ``start_page`` / ``end_page`` (each defaults to 1 when absent), and the
    ``"text"`` field of the JSON response is stored in ``state["text"]``
    (empty string when the response has no such field).

    When ``filename`` carries no extension but the local path does, the
    path's extension is appended to the name sent to the API. ``state``'s
    own ``filename`` entry is left untouched.

    Args:
        state: Mutable mapping with ``filename`` and ``temp_files`` entries.
            Optional entries: ``start_page``, ``end_page`` and
            ``request_timeout`` (seconds, defaults to 300).

    Returns:
        The same ``state`` object, with ``text`` set.

    Raises:
        KeyError: If ``filename`` or its ``temp_files`` entry is missing.
        OSError: If the local file cannot be opened.
        RuntimeError: If the API answers with a status code other than 200.
    """
    # Function-scope import so this block does not depend on the file's
    # import section being edited.
    import mimetypes

    filename = state["filename"]
    path = state["temp_files"][filename]

    # Prefer the extension already on the logical filename; otherwise borrow
    # the one from the on-disk path so the uploaded name always has one when
    # it can be determined.
    if not os.path.splitext(filename)[1]:
        path_extension = os.path.splitext(path)[1]
        if path_extension:
            filename = f"{filename}{path_extension}"

    # Label the upload by its real type; fall back to the historical
    # hard-coded PDF type when the extension is unknown or missing.
    content_type = mimetypes.guess_type(filename)[0] or "application/pdf"

    # Only send credentials when a token is configured, instead of the
    # literal string "Bearer None".
    headers = {}
    token = os.getenv("HUGGINGFACE_API_TOKEN")
    if token:
        headers["Authorization"] = f"Bearer {token}"

    data = {
        "filename": filename,  # includes the extension when one was found
        "start_page": state.get("start_page", 1),
        "end_page": state.get("end_page", 1),
    }

    with open(path, "rb") as f:
        resp = requests.post(
            EXTRACT_TEXT_API,
            files={"file": (filename, f, content_type)},
            data=data,
            headers=headers,
            # Without a timeout, requests waits indefinitely on a stalled server.
            timeout=state.get("request_timeout", 300),
        )

    if resp.status_code != 200:
        raise RuntimeError(f"Extract text API failed: {resp.text}")

    state["text"] = resp.json().get("text", "")
    return state