Spaces:
Sleeping
Sleeping
UPDATE: ThreadPoolExecutor
Browse files - functions.py +4 -4
functions.py
CHANGED
|
@@ -292,13 +292,13 @@ def getLinks(url: str, timeout = 30):
|
|
| 292 |
return list(set([x[:len(x) - 1] if x[-1] == "/" else x for x in uniqueLinks]))
|
| 293 |
|
| 294 |
|
| 295 |
-
def getText(image):
|
| 296 |
-
global reader
|
| 297 |
-
return "\n".join([text[1] for text in reader.readtext(np.array(image.resize((500, 500))), paragraph=True)])
|
| 298 |
|
| 299 |
def getTextFromImagePDF(pdfBytes):
|
|
|
|
|
|
|
|
|
|
| 300 |
allImages = convert_from_bytes(pdfBytes)
|
| 301 |
-
with ThreadPoolExecutor(max_workers =
|
| 302 |
texts = list(p.map(getText, allImages))
|
| 303 |
return "\n\n\n".join(texts)
|
| 304 |
|
|
|
|
| 292 |
return list(set([x[:len(x) - 1] if x[-1] == "/" else x for x in uniqueLinks]))
|
| 293 |
|
| 294 |
|
|
|
|
|
|
|
|
|
|
| 295 |
|
| 296 |
def getTextFromImagePDF(pdfBytes):
|
| 297 |
+
def getText(image):
|
| 298 |
+
global reader
|
| 299 |
+
return "\n".join([text[1] for text in reader.readtext(np.array(image), paragraph=True)])
|
| 300 |
allImages = convert_from_bytes(pdfBytes)
|
| 301 |
+
with ThreadPoolExecutor(max_workers = 32) as p:
|
| 302 |
texts = list(p.map(getText, allImages))
|
| 303 |
return "\n\n\n".join(texts)
|
| 304 |
|