Aryan Jain committed on
Commit
cf34c19
·
1 Parent(s): 5cb75d9

update file client to process word document

Browse files
Files changed (2) hide show
  1. Dockerfile +11 -4
  2. src/utils/_file_client.py +41 -15
Dockerfile CHANGED
@@ -13,11 +13,13 @@ WORKDIR /app
13
  RUN apt-get update && apt-get install -y \
14
  curl \
15
  build-essential \
16
- && rm -rf /var/lib/apt/lists/*
17
-
18
- RUN apt-get update && apt-get install -y \
19
  libreoffice \
 
20
  fonts-dejavu \
 
 
 
 
21
  && apt-get clean \
22
  && rm -rf /var/lib/apt/lists/*
23
 
@@ -33,7 +35,12 @@ COPY pyproject.toml poetry.lock* /app/
33
  RUN poetry config virtualenvs.create false \
34
  && poetry install --no-interaction --no-ansi
35
 
36
- ENV HOME=/tmp
 
 
 
 
 
37
 
38
  # Copy the entire project
39
  COPY . /app/
 
13
  RUN apt-get update && apt-get install -y \
14
  curl \
15
  build-essential \
 
 
 
16
  libreoffice \
17
+ default-jre-headless \
18
  fonts-dejavu \
19
+ fonts-liberation \
20
+ fontconfig \
21
+ xvfb \
22
+ dbus-x11 \
23
  && apt-get clean \
24
  && rm -rf /var/lib/apt/lists/*
25
 
 
35
  RUN poetry config virtualenvs.create false \
36
  && poetry install --no-interaction --no-ansi
37
 
38
+ RUN mkdir -p /tmp/.config/libreoffice && \
39
+ chmod 755 /tmp/.config/libreoffice
40
+
41
+ ENV HOME=/tmp \
42
+ TMPDIR=/tmp \
43
+ JAVA_HOME=/usr/lib/jvm/default-java
44
 
45
  # Copy the entire project
46
  COPY . /app/
src/utils/_file_client.py CHANGED
@@ -9,6 +9,7 @@ import aiofiles
9
  import markdown2
10
  import subprocess
11
  import tempfile
 
12
 
13
  class FileClient:
14
  def __init__(self):
@@ -58,31 +59,56 @@ class FileClient:
58
  data = {
59
  "file_type": "Word Document"
60
  }
 
 
 
61
  async with aiofiles.tempfile.NamedTemporaryFile(delete=False, suffix=".docx") as tmp_docx:
62
  tmp_docx_path = tmp_docx.name
63
  await tmp_docx.write(file_bytes.getvalue())
64
- tmp_pdf_path = tmp_docx_path.replace(".docx", ".pdf")
65
- # await asyncio.to_thread(convert, tmp_docx_path, tmp_pdf_path)
66
- output_dir = tempfile.gettempdir()
67
  env = os.environ.copy()
68
- env["HOME"] = tempfile.mkdtemp()
69
- await asyncio.to_thread(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  subprocess.run,
71
- [
72
- "libreoffice",
73
- "--headless",
74
- "--convert-to", "pdf",
75
- "--outdir", output_dir,
76
- tmp_docx_path
77
- ],
78
- check=True,
79
- env=env
80
  )
81
- tmp_pdf_path = os.path.join(output_dir, os.path.splitext(os.path.basename(tmp_docx_path))[0] + ".pdf")
 
 
 
82
  async with aiofiles.open(tmp_pdf_path, "rb") as pdf_file:
83
  pdf_bytes = await pdf_file.read()
 
84
  final_pdf_bytes = io.BytesIO(pdf_bytes)
85
  data["data"] = await self.extract_from_pdf(file_bytes=final_pdf_bytes)
 
86
  return data
87
 
88
  async def extract_from_excel(self, file_bytes: io.BytesIO):
 
9
  import markdown2
10
  import subprocess
11
  import tempfile
12
+ from pathlib import Path
13
 
14
  class FileClient:
15
  def __init__(self):
 
59
  data = {
60
  "file_type": "Word Document"
61
  }
62
+ temp_home = tempfile.mkdtemp()
63
+ output_dir = tempfile.mkdtemp()
64
+
65
  async with aiofiles.tempfile.NamedTemporaryFile(delete=False, suffix=".docx") as tmp_docx:
66
  tmp_docx_path = tmp_docx.name
67
  await tmp_docx.write(file_bytes.getvalue())
68
+
 
 
69
  env = os.environ.copy()
70
+ env.update({
71
+ "HOME": temp_home,
72
+ "TMPDIR": output_dir,
73
+ "SAL_USE_VCLPLUGIN": "svp",
74
+ "DISPLAY": ":99",
75
+ })
76
+
77
+ profile_dir = os.path.join(temp_home, ".config", "libreoffice", "4", "user")
78
+ os.makedirs(profile_dir, exist_ok=True)
79
+
80
+ cmd = [
81
+ "libreoffice",
82
+ "--headless",
83
+ "--invisible",
84
+ "--nodefault",
85
+ "--nolockcheck",
86
+ "--nologo",
87
+ "--norestore",
88
+ "--convert-to", "pdf",
89
+ "--outdir", output_dir,
90
+ tmp_docx_path
91
+ ]
92
+
93
+ process = await asyncio.to_thread(
94
  subprocess.run,
95
+ cmd,
96
+ check=False,
97
+ capture_output=True,
98
+ text=True,
99
+ env=env,
100
+ timeout=60
 
 
 
101
  )
102
+
103
+ pdf_filename = os.path.splitext(os.path.basename(tmp_docx_path))[0] + ".pdf"
104
+ tmp_pdf_path = os.path.join(output_dir, pdf_filename)
105
+
106
  async with aiofiles.open(tmp_pdf_path, "rb") as pdf_file:
107
  pdf_bytes = await pdf_file.read()
108
+
109
  final_pdf_bytes = io.BytesIO(pdf_bytes)
110
  data["data"] = await self.extract_from_pdf(file_bytes=final_pdf_bytes)
111
+
112
  return data
113
 
114
  async def extract_from_excel(self, file_bytes: io.BytesIO):