update parameter
Browse files
app.py
CHANGED
|
@@ -269,41 +269,33 @@ def extract_text_from_pdf(file):
|
|
| 269 |
return text
|
| 270 |
|
| 271 |
|
| 272 |
-
def extract_text_from_docx(
|
| 273 |
text = ""
|
| 274 |
-
doc = Document(
|
| 275 |
for paragraph in doc.paragraphs:
|
| 276 |
text += paragraph.text + "\n"
|
| 277 |
return text
|
| 278 |
|
| 279 |
|
| 280 |
-
def convert_doc_to_text(
|
| 281 |
try:
|
| 282 |
subprocess.run(
|
| 283 |
-
["unoconv", "--format", "txt",
|
| 284 |
capture_output=True,
|
| 285 |
text=True,
|
| 286 |
check=True,
|
| 287 |
)
|
| 288 |
-
txt_file_path =
|
| 289 |
with open(txt_file_path, "r") as f:
|
| 290 |
text = f.read()
|
| 291 |
text = text.lstrip("\ufeff")
|
| 292 |
os.remove(txt_file_path)
|
| 293 |
return text
|
| 294 |
except subprocess.CalledProcessError as e:
|
| 295 |
-
print(f"Error converting {
|
| 296 |
return ""
|
| 297 |
|
| 298 |
|
| 299 |
-
def extract_text_from_doc_or_docx(file):
    """Extract text from an uploaded Word document, choosing the reader by extension.

    Args:
        file: Uploaded file object whose ``name`` attribute holds its path.

    Returns:
        The result of ``extract_text_from_docx`` for ``.docx`` files, the
        result of ``convert_doc_to_text`` for ``.doc`` files, or a fixed
        "unsupported" message for any other extension.
    """
    # Compare case-insensitively so uploads such as "REPORT.DOCX" are not
    # rejected as unsupported.
    lowered_name = file.name.lower()
    if lowered_name.endswith(".docx"):
        return extract_text_from_docx(file)
    if lowered_name.endswith(".doc"):
        return convert_doc_to_text(file.name)
    return "Unsupported file type. Please upload a .doc or .docx file."
|
| 306 |
-
|
| 307 |
|
| 308 |
# function that generates a random string
|
| 309 |
def generate_random_string(length=23):
|
|
@@ -405,12 +397,6 @@ pdf_to_text = gr.Interface(
|
|
| 405 |
api_name="pdf_to_text",
|
| 406 |
)
|
| 407 |
|
| 408 |
-
# doc_or_docx_to_text = gr.Interface(
|
| 409 |
-
# extract_text_from_doc_or_docx,
|
| 410 |
-
# gr.File(),
|
| 411 |
-
# gr.Textbox(placeholder="Extracted text from DOC or DOCX will appear here"),
|
| 412 |
-
# api_name="doc_or_docx_to_text",
|
| 413 |
-
# )
|
| 414 |
doc_to_text = gr.Interface(
|
| 415 |
convert_doc_to_text,
|
| 416 |
gr.File(),
|
|
@@ -424,13 +410,6 @@ docx_to_text = gr.Interface(
|
|
| 424 |
api_name="docx_to_text"
|
| 425 |
)
|
| 426 |
|
| 427 |
-
# pptx_or_ppt_to_text = gr.Interface(
|
| 428 |
-
# extract_text_from_ppt_or_pptx,
|
| 429 |
-
# gr.File(),
|
| 430 |
-
# gr.Textbox(placeholder="Extracted text from PPTX will appear here"),
|
| 431 |
-
# api_name="pptx_or_ppt_to_text",
|
| 432 |
-
# )
|
| 433 |
-
|
| 434 |
ppt_to_text = gr.Interface(
|
| 435 |
extract_text_from_ppt,
|
| 436 |
gr.File(),
|
|
|
|
| 269 |
return text
|
| 270 |
|
| 271 |
|
| 272 |
+
def extract_text_from_docx(file_path):
    """Return the text of every paragraph in a ``.docx`` document.

    Args:
        file_path: Either the document's path (``str`` or ``os.PathLike``) or
            an uploaded-file object that exposes the path as ``.name``.

    Returns:
        The paragraphs' text in document order, each followed by a newline.
        An empty string when the document has no paragraphs.
    """
    # Accept a plain path as well as an object carrying the path in `.name`,
    # so this function takes the same kinds of input as convert_doc_to_text.
    if isinstance(file_path, (str, os.PathLike)):
        source = os.fspath(file_path)
    else:
        source = file_path.name

    doc = Document(source)
    # Join once instead of growing the string with repeated `+=`.
    return "".join(paragraph.text + "\n" for paragraph in doc.paragraphs)
|
| 278 |
|
| 279 |
|
| 280 |
+
def convert_doc_to_text(file_path):
    """Convert a ``.doc`` file to plain text by running ``unoconv``.

    ``unoconv --format txt`` is run on the input; the resulting ``.txt`` file
    (same path with the final extension replaced) is read and then deleted.

    Args:
        file_path: Either the document's path (``str`` or ``os.PathLike``) or
            an uploaded-file object that exposes the path as ``.name``.

    Returns:
        The converted text with any leading byte-order mark removed, or an
        empty string when ``unoconv`` exits with a non-zero status.
    """
    if isinstance(file_path, (str, os.PathLike)):
        source = os.fspath(file_path)
    else:
        source = file_path.name

    try:
        subprocess.run(
            ["unoconv", "--format", "txt", source],
            capture_output=True,
            text=True,
            check=True,
        )
    except subprocess.CalledProcessError as e:
        print(f"Error converting {source} to text: {e}")
        return ""

    # Replace only the final extension. A blanket str.replace(".doc", ".txt")
    # also rewrites ".doc" inside directory names, and when the path contains
    # no ".doc" at all it returns the input path, which would then be read
    # and deleted as if it were the converted output.
    txt_file_path = os.path.splitext(source)[0] + ".txt"
    try:
        # NOTE(review): assumes unoconv writes UTF-8 (the BOM strip below
        # only works when the file is decoded as UTF-8) - confirm.
        with open(txt_file_path, "r", encoding="utf-8") as f:
            text = f.read()
    finally:
        # Remove the intermediate file even when reading it fails.
        if os.path.exists(txt_file_path):
            os.remove(txt_file_path)
    return text.lstrip("\ufeff")
|
| 297 |
|
| 298 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 299 |
|
| 300 |
# function that generates a random string
|
| 301 |
def generate_random_string(length=23):
|
|
|
|
| 397 |
api_name="pdf_to_text",
|
| 398 |
)
|
| 399 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 400 |
doc_to_text = gr.Interface(
|
| 401 |
convert_doc_to_text,
|
| 402 |
gr.File(),
|
|
|
|
| 410 |
api_name="docx_to_text"
|
| 411 |
)
|
| 412 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 413 |
ppt_to_text = gr.Interface(
|
| 414 |
extract_text_from_ppt,
|
| 415 |
gr.File(),
|