DaniFera committed on
Commit
5f371f8
·
verified ·
1 Parent(s): 24b0c95

Update core.py

Browse files
Files changed (1) hide show
  1. core.py +121 -36
core.py CHANGED
@@ -1,26 +1,27 @@
1
- # Versión 2.3: Core con Comparación Palabra por Palabra (Precisión Alta)
2
- # Autor: Gemini (AI Assistant)
3
-
4
  import os
5
  import zipfile
6
  import uuid
7
  import subprocess
8
  import difflib
 
9
  import pdfplumber
10
  import pandas as pd
11
- from pypdf import PdfWriter, PdfReader
12
  from pdf2image import convert_from_path
13
  from pdf2docx import Converter
14
  from PIL import Image
15
  from pptx import Presentation
16
  from pptx.util import Inches
17
 
18
- # ReportLab para generar el PDF de diferencias
19
- from reportlab.lib.pagesizes import A4
20
  from reportlab.lib import colors
21
  from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
22
  from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
23
  from reportlab.lib.enums import TA_JUSTIFY
 
 
24
 
25
  from config import TEMP_DIR
26
 
@@ -102,20 +103,127 @@ class PDFEngine:
102
  return sorted(list(set(key_pages)))
103
 
104
  # --- CORE PDF TOOLS ---
105
- def merge_pdfs(self, file_paths: list, order_indices: list = None) -> str:
 
106
  if not file_paths: raise ValueError("No hay archivos.")
 
 
107
  ordered = []
108
  if order_indices and len(order_indices) == len(file_paths):
109
  try: ordered = [file_paths[int(i)] for i in order_indices]
110
  except: ordered = file_paths
111
  else: ordered = file_paths
 
 
112
  m = PdfWriter()
113
  for p in ordered: m.append(p)
114
- out = self._get_output_path("unido.pdf")
115
- with open(out, "wb") as f: m.write(f)
116
- m.close()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
  return out
118
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
  def split_pdf_custom(self, file_path: str, range_str: str) -> str:
120
  if not file_path: raise ValueError("Falta archivo.")
121
  r = PdfReader(file_path)
@@ -208,12 +316,8 @@ class PDFEngine:
208
  return out
209
  except Exception as e: raise RuntimeError(f"Error: {e}")
210
 
211
- # --- COMPARACIÓN DE TEXTO PALABRA POR PALABRA (v2.3) ---
212
  def compare_pdfs_text(self, path_a: str, path_b: str) -> str:
213
- """
214
- Compara el TEXTO de dos PDFs tokenizando por PALABRAS.
215
- Esto evita que líneas enteras se marquen como erróneas si solo cambia una palabra.
216
- """
217
  if not path_a or not path_b: raise ValueError("Faltan archivos.")
218
 
219
  def get_all_words(path):
@@ -223,24 +327,18 @@ class PDFEngine:
223
  for page in reader.pages:
224
  extracted = page.extract_text()
225
  if extracted: text += extracted + " "
226
- # Tokenizar por palabras (split elimina espacios extra y saltos de línea)
227
  return text.split()
228
  except Exception as e:
229
  raise RuntimeError(f"Error leyendo PDF: {e}")
230
 
231
- # Obtenemos listas de palabras ['La', 'casa', 'es', 'roja'...]
232
  words_a = get_all_words(path_a)
233
  words_b = get_all_words(path_b)
234
-
235
- # Calculamos diferencias palabra por palabra
236
  diff = difflib.ndiff(words_a, words_b)
237
 
238
- # Generar PDF
239
  output_path = self._get_output_path("informe_diferencias_palabras.pdf")
240
  doc = SimpleDocTemplate(output_path, pagesize=A4)
241
  styles = getSampleStyleSheet()
242
 
243
- # Estilo para el cuerpo del texto
244
  style_body = ParagraphStyle(
245
  'Body',
246
  parent=styles['BodyText'],
@@ -253,57 +351,44 @@ class PDFEngine:
253
  story.append(Paragraph("Informe de Comparación (Modo Palabras)", styles['Heading1']))
254
  story.append(Spacer(1, 12))
255
 
256
- # Leyenda
257
  legend = '<b>Leyenda:</b> <font color="red"><strike>Eliminado</strike></font> | <font color="green"><b>Añadido</b></font> | Texto Común'
258
  story.append(Paragraph(legend, style_body))
259
  story.append(Spacer(1, 12))
260
  story.append(Paragraph(f"<b>A:</b> {os.path.basename(path_a)} | <b>B:</b> {os.path.basename(path_b)}", style_body))
261
  story.append(Spacer(1, 12))
262
 
263
- # Reconstrucción del texto
264
- # Acumularemos fragmentos HTML para crear párrafos.
265
- # ReportLab tiene límites de tamaño por párrafo, así que hacemos "flush" cada cierto tiempo.
266
-
267
  current_html = ""
268
  word_count = 0
269
 
270
  for token in diff:
271
  code = token[:2]
272
  word = token[2:]
273
-
274
- # Escapar caracteres especiales XML
275
  safe_word = word.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
276
 
277
  chunk = ""
278
  if code == '- ':
279
- # Eliminado (Rojo + Tachado)
280
  chunk = f'<font color="red"><strike>{safe_word}</strike></font> '
281
  elif code == '+ ':
282
- # Añadido (Verde + Negrita)
283
  chunk = f'<font color="green"><b>{safe_word}</b></font> '
284
  elif code == ' ':
285
- # Igual (Negro)
286
  chunk = f'{safe_word} '
287
- # Ignoramos líneas '?'
288
 
289
  current_html += chunk
290
  word_count += 1
291
 
292
- # Crear un nuevo párrafo cada ~300 palabras para evitar problemas de renderizado
293
- if word_count > 300 and code == ' ': # Cortar preferiblemente en texto normal
294
  story.append(Paragraph(current_html, style_body))
295
  story.append(Spacer(1, 6))
296
  current_html = ""
297
  word_count = 0
298
 
299
- # Añadir el resto
300
  if current_html:
301
  story.append(Paragraph(current_html, style_body))
302
 
303
  doc.build(story)
304
  return output_path
305
 
306
- # --- CONVERSIONES OFFICE (v2.0) ---
307
  def pdf_to_excel(self, f):
308
  if not f: raise ValueError("Falta archivo.")
309
  try:
 
1
+ # Versión 2.4
 
 
2
  import os
3
  import zipfile
4
  import uuid
5
  import subprocess
6
  import difflib
7
+ import io
8
  import pdfplumber
9
  import pandas as pd
10
+ from pypdf import PdfWriter, PdfReader, Transformation
11
  from pdf2image import convert_from_path
12
  from pdf2docx import Converter
13
  from PIL import Image
14
  from pptx import Presentation
15
  from pptx.util import Inches
16
 
17
+ # ReportLab para generar PDFs (Informes y Capas de texto)
18
+ from reportlab.lib.pagesizes import A4, letter
19
  from reportlab.lib import colors
20
  from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
21
  from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
22
  from reportlab.lib.enums import TA_JUSTIFY
23
+ from reportlab.pdfgen import canvas
24
+ from reportlab.lib.units import inch
25
 
26
  from config import TEMP_DIR
27
 
 
103
  return sorted(list(set(key_pages)))
104
 
105
  # --- CORE PDF TOOLS ---
106
+
107
def merge_pdfs(self, file_paths: list, order_indices: list = None, use_numbering: bool = False) -> str:
    """Merge several PDFs into one file, optionally stamping page numbers.

    Args:
        file_paths: Paths of the PDFs to merge.
        order_indices: Optional positions into ``file_paths`` giving the merge
            order. It is honoured only when it has the same length as
            ``file_paths`` and every entry converts to a usable index;
            otherwise the input order is kept.
        use_numbering: When true, the merged file is passed through
            ``_add_page_numbers`` and that result is returned instead.

    Returns:
        Path of the merged (and, if requested, numbered) PDF.

    Raises:
        ValueError: If ``file_paths`` is empty.
    """
    if not file_paths:
        raise ValueError("No hay archivos.")

    # 1. Resolve the merge order, falling back to the input order when the
    #    requested ordering is unusable.
    ordered = file_paths
    if order_indices and len(order_indices) == len(file_paths):
        try:
            ordered = [file_paths[int(i)] for i in order_indices]
        except (TypeError, ValueError, IndexError, OverflowError):
            ordered = file_paths

    # 2. Merge. The writer is always closed so it does not keep the source
    #    documents alive after the output has been written.
    writer = PdfWriter()
    try:
        for path in ordered:
            writer.append(path)
        temp_out = self._get_output_path("temp_unido.pdf")
        with open(temp_out, "wb") as fh:
            writer.write(fh)
    finally:
        writer.close()

    if not use_numbering:
        return temp_out

    # 3. Number the pages. The intermediate file is removed whether or not
    #    numbering succeeds, so a failure does not leave it behind.
    try:
        return self._add_page_numbers(temp_out)
    finally:
        try:
            os.remove(temp_out)
        except OSError:
            # Best effort only: a leftover intermediate file is harmless.
            pass
133
+
134
def _add_page_numbers(self, file_path: str) -> str:
    """Stamp "Página X de Y" at the bottom centre of every page.

    For each page a one-page overlay is drawn in memory with ReportLab and
    merged onto the original page with pypdf.

    Args:
        file_path: Path of the PDF to number.

    Returns:
        Path of the numbered PDF.
    """
    reader = PdfReader(file_path)
    writer = PdfWriter()
    num_pages = len(reader.pages)

    try:
        for i, page in enumerate(reader.pages):
            box = page.mediabox
            left, bottom = float(box.left), float(box.bottom)
            width, height = float(box.width), float(box.height)

            # Size the overlay to the real page instead of a fixed letter
            # sheet, so the label is not placed (or clipped) relative to the
            # wrong page size on A4, landscape or large-format pages.
            packet = io.BytesIO()
            can = canvas.Canvas(packet, pagesize=(left + width, bottom + height))
            can.setFont("Helvetica", 10)
            # Offset by the media-box origin so the label stays centred,
            # 20 points above the bottom edge, when the origin is not (0, 0).
            can.drawCentredString(
                left + width / 2.0,
                bottom + 20,
                f"Página {i+1} de {num_pages}",
            )
            can.save()

            # Rewind the buffer so pypdf reads the overlay from its start.
            packet.seek(0)
            overlay = PdfReader(packet)

            page.merge_page(overlay.pages[0])
            writer.add_page(page)

        out = self._get_output_path("unido_numerado.pdf")
        with open(out, "wb") as f:
            writer.write(f)
    finally:
        writer.close()
    return out
165
 
166
def add_watermark(self, file_path: str, text: str) -> str:
    """Overlay a diagonal, semi-transparent text watermark on every page.

    The watermark is drawn centred on each page's own media box. Overlays are
    built once per distinct page geometry and reused for pages that share it.

    Args:
        file_path: Path of the PDF to watermark.
        text: Watermark text.

    Returns:
        Path of the watermarked PDF.

    Raises:
        ValueError: If ``file_path`` or ``text`` is empty.
    """
    if not file_path or not text:
        raise ValueError("Falta archivo o texto.")

    reader = PdfReader(file_path)
    writer = PdfWriter()
    overlays = {}  # (left, bottom, width, height) -> overlay page

    def overlay_for(box):
        """Return the watermark overlay page matching ``box``'s geometry."""
        key = (float(box.left), float(box.bottom), float(box.width), float(box.height))
        if key not in overlays:
            left, bottom, width, height = key
            packet = io.BytesIO()
            can = canvas.Canvas(packet, pagesize=(left + width, bottom + height))
            can.setFont("Helvetica-Bold", 50)
            can.setFillColorRGB(0.5, 0.5, 0.5, 0.3)  # semi-transparent grey

            # Move the origin to the true page centre, then rotate, so the
            # text is centred whatever the page size is.
            can.saveState()
            can.translate(left + width / 2.0, bottom + height / 2.0)
            can.rotate(45)
            can.drawCentredString(0, 0, text)
            can.restoreState()
            can.save()

            packet.seek(0)
            overlays[key] = PdfReader(packet).pages[0]
        return overlays[key]

    try:
        for page in reader.pages:
            page.merge_page(overlay_for(page.mediabox))
            writer.add_page(page)

        out = self._get_output_path("marca_agua.pdf")
        with open(out, "wb") as f:
            writer.write(f)
    finally:
        writer.close()
    return out
199
+
200
+ # --- NUEVO: Reparador (Ghostscript) ---
201
def repair_pdf(self, file_path: str) -> str:
    """Regenerate a PDF by rewriting it through Ghostscript's ``pdfwrite`` device.

    Args:
        file_path: Path of the PDF to repair.

    Returns:
        Path of the regenerated PDF.

    Raises:
        ValueError: If ``file_path`` is empty.
        RuntimeError: If Ghostscript exits with an error, or if it cannot be
            run at all (for example when the ``gs`` binary is missing).
    """
    if not file_path:
        raise ValueError("Falta archivo.")

    out = self._get_output_path("reparado.pdf")

    # An absolute path can never start with "-" or "@", so Ghostscript cannot
    # mistake a user-supplied file name for a switch or an argument file.
    source = os.path.abspath(file_path)

    cmd = [
        "gs",
        "-o", out,                  # output file
        "-sDEVICE=pdfwrite",        # rewrite the document as a fresh PDF
        "-dPDFSETTINGS=/default",   # default quality preset
        "-dSAFER",                  # restrict file access while processing untrusted input
        "-dInteract=N",
        "-dNOPAUSE", "-dQUIET", "-dBATCH",
        source,
    ]

    try:
        subprocess.run(cmd, check=True)
        return out
    except subprocess.CalledProcessError as e:
        self_cleanup = True
        raise RuntimeError("Ghostscript no pudo reparar el archivo (daño severo).") from e
    except Exception as e:
        self_cleanup = True
        raise RuntimeError(f"Error sistema: {e}") from e
    finally:
        # On failure, drop any partially written output so it is not mistaken
        # for a valid result.
        if locals().get("self_cleanup"):
            try:
                os.remove(out)
            except OSError:
                pass
226
+
227
  def split_pdf_custom(self, file_path: str, range_str: str) -> str:
228
  if not file_path: raise ValueError("Falta archivo.")
229
  r = PdfReader(file_path)
 
316
  return out
317
  except Exception as e: raise RuntimeError(f"Error: {e}")
318
 
319
+ # --- COMPARACIÓN DE TEXTO PALABRA POR PALABRA ---
320
  def compare_pdfs_text(self, path_a: str, path_b: str) -> str:
 
 
 
 
321
  if not path_a or not path_b: raise ValueError("Faltan archivos.")
322
 
323
  def get_all_words(path):
 
327
  for page in reader.pages:
328
  extracted = page.extract_text()
329
  if extracted: text += extracted + " "
 
330
  return text.split()
331
  except Exception as e:
332
  raise RuntimeError(f"Error leyendo PDF: {e}")
333
 
 
334
  words_a = get_all_words(path_a)
335
  words_b = get_all_words(path_b)
 
 
336
  diff = difflib.ndiff(words_a, words_b)
337
 
 
338
  output_path = self._get_output_path("informe_diferencias_palabras.pdf")
339
  doc = SimpleDocTemplate(output_path, pagesize=A4)
340
  styles = getSampleStyleSheet()
341
 
 
342
  style_body = ParagraphStyle(
343
  'Body',
344
  parent=styles['BodyText'],
 
351
  story.append(Paragraph("Informe de Comparación (Modo Palabras)", styles['Heading1']))
352
  story.append(Spacer(1, 12))
353
 
 
354
  legend = '<b>Leyenda:</b> <font color="red"><strike>Eliminado</strike></font> | <font color="green"><b>Añadido</b></font> | Texto Común'
355
  story.append(Paragraph(legend, style_body))
356
  story.append(Spacer(1, 12))
357
  story.append(Paragraph(f"<b>A:</b> {os.path.basename(path_a)} | <b>B:</b> {os.path.basename(path_b)}", style_body))
358
  story.append(Spacer(1, 12))
359
 
 
 
 
 
360
  current_html = ""
361
  word_count = 0
362
 
363
  for token in diff:
364
  code = token[:2]
365
  word = token[2:]
 
 
366
  safe_word = word.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
367
 
368
  chunk = ""
369
  if code == '- ':
 
370
  chunk = f'<font color="red"><strike>{safe_word}</strike></font> '
371
  elif code == '+ ':
 
372
  chunk = f'<font color="green"><b>{safe_word}</b></font> '
373
  elif code == ' ':
 
374
  chunk = f'{safe_word} '
 
375
 
376
  current_html += chunk
377
  word_count += 1
378
 
379
+ if word_count > 300 and code == ' ':
 
380
  story.append(Paragraph(current_html, style_body))
381
  story.append(Spacer(1, 6))
382
  current_html = ""
383
  word_count = 0
384
 
 
385
  if current_html:
386
  story.append(Paragraph(current_html, style_body))
387
 
388
  doc.build(story)
389
  return output_path
390
 
391
+ # --- CONVERSIONES OFFICE ---
392
  def pdf_to_excel(self, f):
393
  if not f: raise ValueError("Falta archivo.")
394
  try: