Spaces:
Runtime error
Runtime error
Aryan Jain committed on
Commit ·
cf34c19
1
Parent(s): 5cb75d9
update file client to process word document
Browse files
- Dockerfile +11 -4
- src/utils/_file_client.py +41 -15
Dockerfile
CHANGED
|
@@ -13,11 +13,13 @@ WORKDIR /app
|
|
| 13 |
RUN apt-get update && apt-get install -y \
|
| 14 |
curl \
|
| 15 |
build-essential \
|
| 16 |
-
&& rm -rf /var/lib/apt/lists/*
|
| 17 |
-
|
| 18 |
-
RUN apt-get update && apt-get install -y \
|
| 19 |
libreoffice \
|
|
|
|
| 20 |
fonts-dejavu \
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
&& apt-get clean \
|
| 22 |
&& rm -rf /var/lib/apt/lists/*
|
| 23 |
|
|
@@ -33,7 +35,12 @@ COPY pyproject.toml poetry.lock* /app/
|
|
| 33 |
RUN poetry config virtualenvs.create false \
|
| 34 |
&& poetry install --no-interaction --no-ansi
|
| 35 |
|
| 36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
|
| 38 |
# Copy the entire project
|
| 39 |
COPY . /app/
|
|
|
|
| 13 |
RUN apt-get update && apt-get install -y \
|
| 14 |
curl \
|
| 15 |
build-essential \
|
|
|
|
|
|
|
|
|
|
| 16 |
libreoffice \
|
| 17 |
+
default-jre-headless \
|
| 18 |
fonts-dejavu \
|
| 19 |
+
fonts-liberation \
|
| 20 |
+
fontconfig \
|
| 21 |
+
xvfb \
|
| 22 |
+
dbus-x11 \
|
| 23 |
&& apt-get clean \
|
| 24 |
&& rm -rf /var/lib/apt/lists/*
|
| 25 |
|
|
|
|
| 35 |
RUN poetry config virtualenvs.create false \
|
| 36 |
&& poetry install --no-interaction --no-ansi
|
| 37 |
|
| 38 |
+
RUN mkdir -p /tmp/.config/libreoffice && \
|
| 39 |
+
chmod 755 /tmp/.config/libreoffice
|
| 40 |
+
|
| 41 |
+
ENV HOME=/tmp \
|
| 42 |
+
TMPDIR=/tmp \
|
| 43 |
+
JAVA_HOME=/usr/lib/jvm/default-java
|
| 44 |
|
| 45 |
# Copy the entire project
|
| 46 |
COPY . /app/
|
src/utils/_file_client.py
CHANGED
|
@@ -9,6 +9,7 @@ import aiofiles
|
|
| 9 |
import markdown2
|
| 10 |
import subprocess
|
| 11 |
import tempfile
|
|
|
|
| 12 |
|
| 13 |
class FileClient:
|
| 14 |
def __init__(self):
|
|
@@ -58,31 +59,56 @@ class FileClient:
|
|
| 58 |
data = {
|
| 59 |
"file_type": "Word Document"
|
| 60 |
}
|
|
|
|
|
|
|
|
|
|
| 61 |
async with aiofiles.tempfile.NamedTemporaryFile(delete=False, suffix=".docx") as tmp_docx:
|
| 62 |
tmp_docx_path = tmp_docx.name
|
| 63 |
await tmp_docx.write(file_bytes.getvalue())
|
| 64 |
-
|
| 65 |
-
# await asyncio.to_thread(convert, tmp_docx_path, tmp_pdf_path)
|
| 66 |
-
output_dir = tempfile.gettempdir()
|
| 67 |
env = os.environ.copy()
|
| 68 |
-
env
|
| 69 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
subprocess.run,
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
],
|
| 78 |
-
check=True,
|
| 79 |
-
env=env
|
| 80 |
)
|
| 81 |
-
|
|
|
|
|
|
|
|
|
|
| 82 |
async with aiofiles.open(tmp_pdf_path, "rb") as pdf_file:
|
| 83 |
pdf_bytes = await pdf_file.read()
|
|
|
|
| 84 |
final_pdf_bytes = io.BytesIO(pdf_bytes)
|
| 85 |
data["data"] = await self.extract_from_pdf(file_bytes=final_pdf_bytes)
|
|
|
|
| 86 |
return data
|
| 87 |
|
| 88 |
async def extract_from_excel(self, file_bytes: io.BytesIO):
|
|
|
|
| 9 |
import markdown2
|
| 10 |
import subprocess
|
| 11 |
import tempfile
|
| 12 |
+
from pathlib import Path
|
| 13 |
|
| 14 |
class FileClient:
|
| 15 |
def __init__(self):
|
|
|
|
| 59 |
data = {
|
| 60 |
"file_type": "Word Document"
|
| 61 |
}
|
| 62 |
+
temp_home = tempfile.mkdtemp()
|
| 63 |
+
output_dir = tempfile.mkdtemp()
|
| 64 |
+
|
| 65 |
async with aiofiles.tempfile.NamedTemporaryFile(delete=False, suffix=".docx") as tmp_docx:
|
| 66 |
tmp_docx_path = tmp_docx.name
|
| 67 |
await tmp_docx.write(file_bytes.getvalue())
|
| 68 |
+
|
|
|
|
|
|
|
| 69 |
env = os.environ.copy()
|
| 70 |
+
env.update({
|
| 71 |
+
"HOME": temp_home,
|
| 72 |
+
"TMPDIR": output_dir,
|
| 73 |
+
"SAL_USE_VCLPLUGIN": "svp",
|
| 74 |
+
"DISPLAY": ":99",
|
| 75 |
+
})
|
| 76 |
+
|
| 77 |
+
profile_dir = os.path.join(temp_home, ".config", "libreoffice", "4", "user")
|
| 78 |
+
os.makedirs(profile_dir, exist_ok=True)
|
| 79 |
+
|
| 80 |
+
cmd = [
|
| 81 |
+
"libreoffice",
|
| 82 |
+
"--headless",
|
| 83 |
+
"--invisible",
|
| 84 |
+
"--nodefault",
|
| 85 |
+
"--nolockcheck",
|
| 86 |
+
"--nologo",
|
| 87 |
+
"--norestore",
|
| 88 |
+
"--convert-to", "pdf",
|
| 89 |
+
"--outdir", output_dir,
|
| 90 |
+
tmp_docx_path
|
| 91 |
+
]
|
| 92 |
+
|
| 93 |
+
process = await asyncio.to_thread(
|
| 94 |
subprocess.run,
|
| 95 |
+
cmd,
|
| 96 |
+
check=False,
|
| 97 |
+
capture_output=True,
|
| 98 |
+
text=True,
|
| 99 |
+
env=env,
|
| 100 |
+
timeout=60
|
|
|
|
|
|
|
|
|
|
| 101 |
)
|
| 102 |
+
|
| 103 |
+
pdf_filename = os.path.splitext(os.path.basename(tmp_docx_path))[0] + ".pdf"
|
| 104 |
+
tmp_pdf_path = os.path.join(output_dir, pdf_filename)
|
| 105 |
+
|
| 106 |
async with aiofiles.open(tmp_pdf_path, "rb") as pdf_file:
|
| 107 |
pdf_bytes = await pdf_file.read()
|
| 108 |
+
|
| 109 |
final_pdf_bytes = io.BytesIO(pdf_bytes)
|
| 110 |
data["data"] = await self.extract_from_pdf(file_bytes=final_pdf_bytes)
|
| 111 |
+
|
| 112 |
return data
|
| 113 |
|
| 114 |
async def extract_from_excel(self, file_bytes: io.BytesIO):
|