Spaces:

MicroHealth
/

auto-wiki

Paused

App Files Community

bluenevus committed on Apr 26, 2025

Commit

1bb1cee

verified ·

1 Parent(s): fe02558

Update app.py

Browse files

Files changed (1) hide show

app.py +10 -62

app.py CHANGED Viewed

@@ -4,13 +4,9 @@ import os
 import zipfile
 from dash import Dash, dcc, html, Input, Output, State, callback_context, no_update
 import dash_bootstrap_components as dbc
-from docx import Document
-from docx.enum.style import WD_STYLE_TYPE
-import markdown
 import threading
 import time
-import PyPDF2
-import re
 app = Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
@@ -42,70 +38,22 @@ app.layout = dbc.Container([
     dcc.Download(id="download-zip")
 ])
-def process_docx(contents, filename):
     content_type, content_string = contents.split(',')
     decoded = base64.b64decode(content_string)
-    doc = Document(io.BytesIO(decoded))
-    full_text = []
-    for para in doc.paragraphs:
-        if para.style.name.startswith('Heading'):
-            level = int(para.style.name[-1])
-            full_text.append(f"{'#' * level} {para.text}")
-        else:
-            text = para.text
-            for run in para.runs:
-                if run.bold:
-                    text = text.replace(run.text, f"**{run.text}**")
-                if run.italic:
-                    text = text.replace(run.text, f"*{run.text}*")
-            if para.style.name == 'List Bullet':
-                full_text.append(f"- {text}")
-            elif para.style.name == 'List Number':
-                full_text.append(f"1. {text}")
-            else:
-                full_text.append(text)
-    return '\n\n'.join(full_text)
-def process_pdf(contents, filename):
-    content_type, content_string = contents.split(',')
-    decoded = base64.b64decode(content_string)
-    pdf_file = io.BytesIO(decoded)
-    pdf_reader = PyPDF2.PdfReader(pdf_file)
-    full_text = []
-    for page in pdf_reader.pages:
-        text = page.extract_text()
-        # Basic formatting detection (this is a simplified approach and may not catch all formatting)
-        text = re.sub(r'\*\*(.*?)\*\*', r'**\1**', text)  # Bold
-        text = re.sub(r'_(.*?)_', r'*\1*', text)  # Italic
-        text = re.sub(r'^(\d+\.)\s', r'\1 ', text, flags=re.MULTILINE)  # Numbered lists
-        text = re.sub(r'^[•●○]\s', '- ', text, flags=re.MULTILINE)  # Bullet points
-        # Detect potential headers (simplified approach)
-        lines = text.split('\n')
-        for i, line in enumerate(lines):
-            if i == 0 or (i > 0 and len(line) < 50 and line.strip() and line.strip()[0].isupper()):
-                lines[i] = f"## {line}"
-        full_text.append('\n'.join(lines))
-    return '\n\n'.join(full_text)
 def process_files(contents, filenames):
     processed_files = []
     for c, n in zip(contents, filenames):
-        if n.lower().endswith('.docx'):
-            text = process_docx(c, n)
-        elif n.lower().endswith('.pdf'):
-            text = process_pdf(c, n)
-        else:
-            continue  # Skip unsupported file types
-        md = markdown.markdown(text)
-        processed_files.append((n.replace('.docx', '.md').replace('.pdf', '.md'), md))
         time.sleep(0.1)  # Simulate processing time
     zip_buffer = io.BytesIO()

 import zipfile
 from dash import Dash, dcc, html, Input, Output, State, callback_context, no_update
 import dash_bootstrap_components as dbc
 import threading
 import time
+import pypandoc
 app = Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
     dcc.Download(id="download-zip")
 ])
+def process_file(contents, filename):
     content_type, content_string = contents.split(',')
     decoded = base64.b64decode(content_string)
+    with open(filename, 'wb') as f:
+        f.write(decoded)
+    md_content = pypandoc.convert_file(filename, 'md')
+    os.remove(filename)  # Clean up the temporary file
+    return md_content
 def process_files(contents, filenames):
     processed_files = []
     for c, n in zip(contents, filenames):
+        if n.lower().endswith(('.docx', '.pdf')):
+            text = process_file(c, n)
+            processed_files.append((n.rsplit('.', 1)[0] + '.md', text))
         time.sleep(0.1)  # Simulate processing time
     zip_buffer = io.BytesIO()