Feriel080 committed on
Commit
72d2612
·
verified ·
1 Parent(s): 5f70924
Files changed (1) hide show
  1. utils.py +32 -174
utils.py CHANGED
@@ -1,24 +1,14 @@
1
  from pptx import Presentation
 
2
  import pdfplumber
3
  from reportlab.lib.pagesizes import letter
4
- from reportlab.pdfgen import canvas
5
- from io import BytesIO
6
- import docx
7
  from pathlib import Path
8
- import openpyxl
9
  import re
10
-
11
- from fastapi import UploadFile
12
- from docx import Document
13
  import pandas as pd
14
  import pdfplumber
15
- from docx import Document as DocxDocument
16
- from pptx.util import Inches, Pt
17
- from reportlab.lib.pagesizes import letter
18
- from reportlab.pdfgen import canvas
19
- import os
20
- from io import BytesIO
21
- from openpyxl import Workbook
22
 
23
 
24
  def extract_text(file_path: Path, file_type: str) -> str:
@@ -29,104 +19,68 @@ def extract_text(file_path: Path, file_type: str) -> str:
29
  text = f.read()
30
 
31
  elif file_type == "docx":
32
- doc = docx.Document(file_path)
33
- text = "\n".join([para.text for para in doc.paragraphs if para.text])
34
 
35
  elif file_type == "xlsx":
36
- wb = openpyxl.load_workbook(file_path)
37
- sheet = wb.active
38
- for row in sheet.rows:
39
- for cell in row:
40
- if cell.value is not None:
41
- text += str(cell.value) + " "
42
 
43
  elif file_type == "pptx":
44
  prs = Presentation(file_path)
 
45
  for slide in prs.slides:
46
  for shape in slide.shapes:
47
  if shape.has_text_frame:
48
- for paragraph in shape.text_frame.paragraphs:
49
- if (clean_text := paragraph.text.strip()):
50
- text += clean_text + "\n"
51
-
52
  elif shape.has_table:
53
  for row in shape.table.rows:
54
  for cell in row.cells:
55
- if (cell_text := cell.text.strip()):
56
- text += cell_text + "\n"
 
 
57
 
58
-
59
  elif file_type == "pdf":
60
  with pdfplumber.open(file_path) as pdf:
61
- text = "\n".join(
62
- page.extract_text()
63
- for page in pdf.pages
64
- if page.extract_text()
65
- )
66
 
67
- return text.strip()
68
 
69
- def save_file(text: str, original_path: Path, file_type: str, output_path: Path):
70
  if file_type == "docx":
71
- doc = docx.Document()
72
  doc.add_paragraph(text)
73
  doc.save(output_path)
74
 
75
  elif file_type == "xlsx":
76
- wb = openpyxl.Workbook()
77
- sheet = wb.active
78
- text_lines = text.split(
79
- "\n"
80
- )
81
- for i, line in enumerate(text_lines, start=1):
82
- sheet.cell(row=i, column=1, value=line)
83
- wb.save(output_path)
84
 
85
  elif file_type == "pptx":
86
  prs = Presentation()
87
  slide_layout = prs.slide_layouts[1]
88
-
89
- max_lines = 25
90
  text_lines = text.split('\n')
91
- chunks = []
92
- current_chunk = []
93
-
94
- for line in text_lines:
95
- current_chunk.append(line)
96
- if len(current_chunk) >= max_lines:
97
- chunks.append('\n'.join(current_chunk))
98
- current_chunk = []
99
- if current_chunk:
100
- chunks.append('\n'.join(current_chunk))
101
 
102
  for chunk in chunks:
103
  slide = prs.slides.add_slide(slide_layout)
104
  content = slide.shapes.placeholders[1]
105
-
106
  text_frame = content.text_frame
107
-
108
- text_frame.clear()
109
-
110
- paragraph = text_frame.add_paragraph()
111
- paragraph.text = chunk
112
- paragraph.font.size = Pt(13)
113
 
114
  prs.save(output_path)
115
 
116
  elif file_type == "pdf":
117
- with open(output_path, "wb") as f:
118
- pdf_buffer = BytesIO()
119
- c = canvas.Canvas(pdf_buffer, pagesize=letter)
120
- text_lines = text.split("\n")
121
- y = 750
122
- for line in text_lines:
123
- c.drawString(72, y, line)
124
- y -= 12
125
- if y < 50:
126
- c.showPage()
127
- y = 750
128
- c.save()
129
- f.write(pdf_buffer.getvalue())
130
 
131
  else:
132
  with open(output_path, "w", encoding="utf-8") as f:
@@ -158,24 +112,17 @@ def verify_summary(summary: str, original: str) -> str:
158
  return '. '.join(verified) if verified else summary[:500]
159
 
160
  def ensure_complete_sentences(text: str) -> str:
161
- """Guarantees proper sentence structure with robust error handling"""
162
  if not text or not isinstance(text, str):
163
  return ""
164
 
165
  try:
166
- # Normalize whitespace
167
  text = ' '.join(text.split())
168
-
169
- # Split on sentence boundaries
170
  sentences = re.split(r'(?<=[.!?])\s+', text)
171
-
172
- # Filter and validate sentences
173
  valid_sentences = [
174
  s.strip() for s in sentences
175
  if s.strip() and s[-1] in {'.', '!', '?'}
176
  ]
177
 
178
- # Reconstruct text with proper spacing
179
  reconstructed = ' '.join(valid_sentences)
180
 
181
  # Final safety check
@@ -193,93 +140,4 @@ def ensure_complete_sentences(text: str) -> str:
193
  return reconstructed
194
 
195
  except Exception:
196
- return text
197
-
198
-
199
- async def convert_to_text(file: UploadFile) -> str:
200
- file_extension = file.filename.split(".")[-1].lower()
201
- content = await file.read()
202
-
203
- if file_extension == "txt":
204
- return content.decode("utf-8")
205
-
206
- elif file_extension == "docx":
207
- doc = Document(BytesIO(content))
208
- return "\n".join([para.text for para in doc.paragraphs])
209
-
210
- elif file_extension == "pptx":
211
- ppt = Presentation(BytesIO(content))
212
- text = []
213
- for slide in ppt.slides:
214
- for shape in slide.shapes:
215
- if hasattr(shape, "text"):
216
- text.append(shape.text)
217
- return "\n".join(text)
218
-
219
- elif file_extension == "pdf":
220
- with pdfplumber.open(BytesIO(content)) as pdf:
221
- return "\n".join([page.extract_text() for page in pdf.pages])
222
-
223
- elif file_extension in ["xlsx", "xls"]:
224
- file_like = BytesIO(content)
225
- df = pd.read_excel(file_like)
226
- return df.to_string()
227
-
228
- else:
229
- raise ValueError(f"Unsupported file type: {file_extension}")
230
-
231
-
232
- # save translated text to a file
233
- def save_translated_file(translated_text: str, original_filename: str) -> str:
234
- file_extension = os.path.splitext(original_filename)[-1].lower()
235
- output_dir = "translated_files"
236
- os.makedirs(output_dir, exist_ok=True)
237
-
238
- output_filename = f"translated_{os.path.splitext(original_filename)[0]}{file_extension}"
239
- output_file_path = os.path.join(output_dir, output_filename)
240
-
241
- if file_extension == ".docx":
242
- doc = DocxDocument()
243
- doc.add_paragraph(translated_text)
244
- doc.save(output_file_path)
245
- elif file_extension == ".pdf":
246
- with open(output_file_path, "wb") as f: # create new pdf
247
- pdf_buffer = BytesIO()
248
- c = canvas.Canvas(pdf_buffer, pagesize=letter)
249
- text_lines = translated_text.split("\n")
250
- y = 750 # Position verticale initiale
251
- for line in text_lines:
252
- c.drawString(72, y, line)
253
- y -= 12
254
- if y < 50:
255
- c.showPage()
256
- y = 750
257
- c.save()
258
- f.write(pdf_buffer.getvalue())
259
- elif file_extension == ".pptx":
260
- prs = Presentation()
261
- slide = prs.slides.add_slide(prs.slide_layouts[5])
262
- left = top = width = height = Inches(1)
263
- txBox = slide.shapes.add_textbox(left, top, width, height)
264
- tf = txBox.text_frame
265
- tf.text = translated_text
266
-
267
- prs.save(output_file_path)
268
- elif file_extension in [".xlsx", ".xls"]:
269
- if file_extension == ".xlsx":
270
- wb = Workbook()
271
- ws = wb.active
272
- text_lines = translated_text.split("\n")
273
- for i, line in enumerate(text_lines, start=1):
274
- ws.cell(row=i, column=1, value=line)
275
- wb.save(output_file_path)
276
- else:
277
- df = pd.DataFrame([translated_text.split("\n")])
278
- df.to_excel(output_file_path, index=False, header=False)
279
- else:
280
- output_filename = f"translated_{os.path.splitext(original_filename)[0]}.txt"
281
- output_file_path = os.path.join(output_dir, output_filename)
282
- with open(output_file_path, "w", encoding="utf-8") as f:
283
- f.write(translated_text)
284
-
285
- return output_file_path
 
1
  from pptx import Presentation
2
+ from docx import Document
3
  import pdfplumber
4
  from reportlab.lib.pagesizes import letter
 
 
 
5
  from pathlib import Path
 
6
  import re
 
 
 
7
  import pandas as pd
8
  import pdfplumber
9
+ from concurrent.futures import ThreadPoolExecutor
10
+ from reportlab.platypus import SimpleDocTemplate, Paragraph
11
+ from reportlab.lib.styles import getSampleStyleSheet
 
 
 
 
12
 
13
 
14
  def extract_text(file_path: Path, file_type: str) -> str:
 
19
  text = f.read()
20
 
21
  elif file_type == "docx":
22
+ doc = Document(file_path)
23
+ return "\n".join([para.text for para in doc.paragraphs if para.text.strip()])
24
 
25
  elif file_type == "xlsx":
26
+ df = pd.read_excel(file_path, engine="openpyxl")
27
+ return df.to_string(index=False, header=False).strip()
 
 
 
 
28
 
29
  elif file_type == "pptx":
30
  prs = Presentation(file_path)
31
+ text_parts = []
32
  for slide in prs.slides:
33
  for shape in slide.shapes:
34
  if shape.has_text_frame:
35
+ text_parts.append(shape.text_frame.text.strip())
 
 
 
36
  elif shape.has_table:
37
  for row in shape.table.rows:
38
  for cell in row.cells:
39
+ if cell.text.strip():
40
+ text_parts.append(cell.text.strip())
41
+
42
+ return "\n".join(text_parts)
43
 
 
44
  elif file_type == "pdf":
45
  with pdfplumber.open(file_path) as pdf:
46
+ def extract_page(page):
47
+ return page.extract_text_simple() or ""
48
+ with ThreadPoolExecutor() as executor:
49
+ text_parts = list(executor.map(extract_page, pdf.pages))
50
+ return "\n".join(part for part in text_parts if part).strip()
51
 
52
+ return text
53
 
54
+ def save_file(text: str, file_type: str, output_path: Path):
55
  if file_type == "docx":
56
+ doc = Document()
57
  doc.add_paragraph(text)
58
  doc.save(output_path)
59
 
60
  elif file_type == "xlsx":
61
+ df = pd.DataFrame(text.split("\n"), columns=["Content"])
62
+ df.to_excel(output_path, index=False, engine="xlsxwriter")
 
 
 
 
 
 
63
 
64
  elif file_type == "pptx":
65
  prs = Presentation()
66
  slide_layout = prs.slide_layouts[1]
 
 
67
  text_lines = text.split('\n')
68
+ chunks = [text_lines[i:i+25] for i in range(0, len(text_lines), 25)]
 
 
 
 
 
 
 
 
 
69
 
70
  for chunk in chunks:
71
  slide = prs.slides.add_slide(slide_layout)
72
  content = slide.shapes.placeholders[1]
 
73
  text_frame = content.text_frame
74
+ text_frame.clear()
75
+ text_frame.text = "\n".join(chunk)
 
 
 
 
76
 
77
  prs.save(output_path)
78
 
79
  elif file_type == "pdf":
80
+ doc = SimpleDocTemplate(str(output_path), pagesize=letter)
81
+ styles = getSampleStyleSheet()
82
+ flowables = [Paragraph(line, styles["Normal"]) for line in text.split("\n") if line.strip()]
83
+ doc.build(flowables)
 
 
 
 
 
 
 
 
 
84
 
85
  else:
86
  with open(output_path, "w", encoding="utf-8") as f:
 
112
  return '. '.join(verified) if verified else summary[:500]
113
 
114
  def ensure_complete_sentences(text: str) -> str:
 
115
  if not text or not isinstance(text, str):
116
  return ""
117
 
118
  try:
 
119
  text = ' '.join(text.split())
 
 
120
  sentences = re.split(r'(?<=[.!?])\s+', text)
 
 
121
  valid_sentences = [
122
  s.strip() for s in sentences
123
  if s.strip() and s[-1] in {'.', '!', '?'}
124
  ]
125
 
 
126
  reconstructed = ' '.join(valid_sentences)
127
 
128
  # Final safety check
 
140
  return reconstructed
141
 
142
  except Exception:
143
+ return text