Spaces:

not-lain
/

utils

Running

App Files Files Community

not-lain committed on Nov 14, 2024

Commit

579432a

1 Parent(s): fac9a75

add ppt text extraction support

Browse files

Files changed (1) hide show

app.py +38 -5

app.py CHANGED Viewed

@@ -20,6 +20,39 @@ def extract_text_from_pptx(file_path):
     return "\n\n".join(text_content)
 def convert_pdf_to_image(file):
     images = convert_from_path(file)
@@ -87,16 +120,16 @@ doc_or_docx_to_text = gr.Interface(
     api_name="doc_or_docx_to_text",
 )
-pptx_to_text = gr.Interface(
-    extract_text_from_pptx,
     gr.File(),
     gr.Textbox(placeholder="Extracted text from PPTX will appear here"),
-    api_name="pptx_to_text",
 )
 demo = gr.TabbedInterface(
-    [pdf_to_img, pdf_to_text, doc_or_docx_to_text, pptx_to_text],
-    ["PDF to Image", "Extract PDF Text", "Extract DOC/DOCX Text", "Extract PPTX Text"],
 )
 demo.launch(server_name="0.0.0.0.", server_port=7860, debug=True)

     return "\n\n".join(text_content)
+def extract_text_from_ppt(file_path):
+    """Extract the text of a legacy ``.ppt`` presentation.
+
+    The file is first converted to ``.pptx`` with the external ``unoconv``
+    command, then the ``text`` of every shape that has one is collected.
+
+    Args:
+        file_path: Path to the ``.ppt`` file to read.
+
+    Returns:
+        The slide texts joined by blank lines (shape texts within a slide are
+        joined by single newlines). If conversion or parsing fails, the error
+        is printed and a fixed error message is returned instead.
+    """
+    import tempfile
+
+    try:
+        # Convert into a private scratch directory: the converted copy is
+        # removed even when parsing fails, and an unrelated ``.pptx`` that
+        # happens to sit next to the input is never overwritten or deleted.
+        with tempfile.TemporaryDirectory() as scratch_dir:
+            stem = os.path.splitext(os.path.basename(file_path))[0]
+            pptx_file_path = os.path.join(scratch_dir, stem + '.pptx')
+            # Convert PPT to PPTX using unoconv; -o names the output file.
+            subprocess.run(
+                ['unoconv', '-f', 'pptx', '-o', pptx_file_path, file_path],
+                check=True,
+            )
+            # Extract text from PPTX
+            presentation = Presentation(pptx_file_path)
+            text_content = [
+                "\n".join(
+                    shape.text
+                    for shape in slide.shapes
+                    if hasattr(shape, "text")
+                )
+                for slide in presentation.slides
+            ]
+        return "\n\n".join(text_content)
+    except Exception as e:
+        # Best-effort: callers get a message string rather than an exception.
+        print(f"Error extracting text from PPT file: {e}")
+        return "Error extracting text from PPT file"
+def extract_text_from_ppt_or_pptx(file_path):
+    """Extract text from a ``.ppt`` or ``.pptx`` file, chosen by extension.
+
+    Args:
+        file_path: Path to the presentation file.
+
+    Returns:
+        The result of ``extract_text_from_pptx`` or ``extract_text_from_ppt``,
+        or an "unsupported file type" message for any other extension.
+    """
+    # Match the extension case-insensitively so names such as "DECK.PPTX"
+    # or "slides.Ppt" are accepted instead of being reported as unsupported.
+    lowered_path = file_path.lower()
+    if lowered_path.endswith('.pptx'):
+        return extract_text_from_pptx(file_path)
+    if lowered_path.endswith('.ppt'):
+        return extract_text_from_ppt(file_path)
+    return "Unsupported file type. Please provide a .ppt or .pptx file."
 def convert_pdf_to_image(file):
     images = convert_from_path(file)
     api_name="doc_or_docx_to_text",
 )
+# Interface that takes an uploaded file and shows the text returned by
+# extract_text_from_ppt_or_pptx in a textbox; exposed under the API name
+# "pptx_or_ppt_to_text".
+pptx_or_ppt_to_text = gr.Interface(
+    extract_text_from_ppt_or_pptx,
     gr.File(),
     gr.Textbox(placeholder="Extracted text from PPTX will appear here"),
+    api_name="pptx_or_ppt_to_text",
 )
 demo = gr.TabbedInterface(
+    [pdf_to_img, pdf_to_text, doc_or_docx_to_text, pptx_or_ppt_to_text],
+    ["PDF to Image", "Extract PDF Text", "Extract DOC/DOCX Text", "Extract PPTX/PPT Text"],
 )
+# Listen on all IPv4 interfaces. The address is written without a trailing
+# dot: "0.0.0.0." is not a valid IPv4 literal and only binds when the host
+# resolver happens to accept it.
+demo.launch(server_name="0.0.0.0", server_port=7860, debug=True)