DaniFera committed on
Commit
7dfc046
verified
1 Parent(s): 94a82f1

Update core.py

Browse files
Files changed (1) hide show
  1. core.py +65 -60
core.py CHANGED
@@ -1,4 +1,4 @@
1
- # Versión 2.2: Core con Comparación de Texto (ReportLab)
2
  # Autor: Gemini (AI Assistant)
3
 
4
  import os
@@ -20,12 +20,13 @@ from reportlab.lib.pagesizes import A4
20
  from reportlab.lib import colors
21
  from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
22
  from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
 
23
 
24
  from config import TEMP_DIR
25
 
26
  class PDFEngine:
27
 
28
- # --- UTILIDADES ---
29
  @staticmethod
30
  def _get_output_path(filename: str) -> str:
31
  unique_name = f"{uuid.uuid4().hex[:8]}_{filename}"
@@ -40,7 +41,6 @@ class PDFEngine:
40
  except: return {"pages": 0, "name": "Error", "title": ""}
41
 
42
  def _parse_range_groups(self, range_str: str, max_pages: int) -> list:
43
- # (Lógica idéntica versiones anteriores)
44
  groups = []
45
  parts = range_str.split(',')
46
  for part in parts:
@@ -85,7 +85,6 @@ class PDFEngine:
85
  except: return None
86
 
87
  def get_preview_indices_from_string(self, range_str: str, max_pages: int) -> list:
88
- # (Igual que antes)
89
  key_pages = []
90
  parts = range_str.split(',')
91
  for part in parts:
@@ -150,7 +149,6 @@ class PDFEngine:
150
 
151
  def compress_pdf(self, file_path: str, power: int = 3) -> str:
152
  if not file_path: raise ValueError("Falta archivo.")
153
- # power: 1=Baja, 3=Media/eBook, 4=Alta/Screen
154
  q = {1: "/prepress", 3: "/ebook", 4: "/screen"}
155
  gs_set = q.get(power, "/ebook")
156
  out = self._get_output_path("comprimido.pdf")
@@ -210,88 +208,97 @@ class PDFEngine:
210
  return out
211
  except Exception as e: raise RuntimeError(f"Error: {e}")
212
 
213
- # --- NUEVA COMPARACIÓN DE TEXTO (v2.2) ---
214
  def compare_pdfs_text(self, path_a: str, path_b: str) -> str:
215
  """
216
- Compara el TEXTO de dos PDFs y genera un informe PDF con diferencias resaltadas.
217
- Rojo/Tachado: Eliminado. Verde/Negrita: Añadido.
218
  """
219
  if not path_a or not path_b: raise ValueError("Faltan archivos.")
220
 
221
- # 1. Extraer texto completo
222
- def get_text_lines(path):
223
  try:
224
  reader = PdfReader(path)
225
  text = ""
226
  for page in reader.pages:
227
  extracted = page.extract_text()
228
- if extracted: text += extracted + "\n"
229
- # Dividir por líneas para comparación
230
- return text.splitlines()
231
  except Exception as e:
232
  raise RuntimeError(f"Error leyendo PDF: {e}")
233
 
234
- lines_a = get_text_lines(path_a)
235
- lines_b = get_text_lines(path_b)
 
236
 
237
- # 2. Calcular diferencias (Difflib)
238
- diff = difflib.ndiff(lines_a, lines_b)
239
 
240
- # 3. Generar PDF con ReportLab
241
- output_path = self._get_output_path("informe_diferencias.pdf")
242
  doc = SimpleDocTemplate(output_path, pagesize=A4)
243
  styles = getSampleStyleSheet()
244
 
245
- # Estilos personalizados
246
- style_normal = styles['BodyText']
247
- style_del = ParagraphStyle('Deleted', parent=style_normal, textColor=colors.red, backColor=colors.mistyrose, strike=True)
248
- style_add = ParagraphStyle('Added', parent=style_normal, textColor=colors.darkgreen, backColor=colors.honeydew, fontName='Helvetica-Bold')
249
- style_header = styles['Heading1']
 
 
 
250
 
251
  story = []
252
- story.append(Paragraph("Informe de Comparación de Texto", style_header))
 
 
 
 
 
 
 
253
  story.append(Spacer(1, 12))
254
- story.append(Paragraph(f"<b>Archivo A (Original):</b> {os.path.basename(path_a)}", style_normal))
255
- story.append(Paragraph(f"<b>Archivo B (Modificado):</b> {os.path.basename(path_b)}", style_normal))
256
- story.append(Spacer(1, 24))
257
 
258
- # Procesar diferencias
259
- # ndiff devuelve: '- texto' (borrado), '+ texto' (añadido), ' texto' (igual), '? ...' (metadatos intralínea)
 
260
 
261
- has_changes = False
 
262
 
263
- for line in diff:
264
- code = line[:2]
265
- content = line[2:].strip()
266
 
267
- # Escapar XML/HTML para ReportLab (evitar crash con <, >)
268
- content = content.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
269
-
270
- if not content: continue # Saltar líneas vacías
271
-
272
  if code == '- ':
273
- # Eliminado (Rojo)
274
- p = Paragraph(f"<strike>{content}</strike>", style_del)
275
- story.append(p)
276
- story.append(Spacer(1, 4))
277
- has_changes = True
278
  elif code == '+ ':
279
- # Añadido (Verde)
280
- p = Paragraph(f"{content}", style_add)
281
- story.append(p)
282
- story.append(Spacer(1, 4))
283
- has_changes = True
284
  elif code == ' ':
285
- # Contexto (Grisáceo o normal)
286
- # Para no hacer el informe eterno, podríamos recortar contexto,
287
- # pero mejor ponerlo todo para leer el documento fluido.
288
- p = Paragraph(content, style_normal)
289
- story.append(p)
290
- story.append(Spacer(1, 2))
291
- # Ignoramos líneas que empiezan por '?' (son pistas de difflib sobre dónde está el cambio en la palabra)
 
 
 
 
 
 
292
 
293
- if not has_changes:
294
- story.append(Paragraph("<b>No se encontraron diferencias textuales entre los documentos.</b>", style_normal))
 
295
 
296
  doc.build(story)
297
  return output_path
@@ -325,13 +332,11 @@ class PDFEngine:
325
  try:
326
  imgs = convert_from_path(f, dpi=150)
327
  prs = Presentation()
328
- # Layout blanco
329
  blank = 6
330
  for i, img in enumerate(imgs):
331
  ip = self._get_output_path(f"slide_{i}.jpg")
332
  img.save(ip, "JPEG")
333
  slide = prs.slides.add_slide(prs.slide_layouts[blank])
334
- # Ajustar imagen al ancho de la slide
335
  slide.shapes.add_picture(ip, Inches(0), Inches(0), width=prs.slide_width)
336
  out = self._get_output_path(os.path.basename(f).replace(".pdf", ".pptx"))
337
  prs.save(out)
 
1
+ # Versión 2.3: Core con Comparación Palabra por Palabra (Precisión Alta)
2
  # Autor: Gemini (AI Assistant)
3
 
4
  import os
 
20
  from reportlab.lib import colors
21
  from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
22
  from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
23
+ from reportlab.lib.enums import TA_JUSTIFY
24
 
25
  from config import TEMP_DIR
26
 
27
  class PDFEngine:
28
 
29
+ # --- UTILIDADES INTERNAS ---
30
  @staticmethod
31
  def _get_output_path(filename: str) -> str:
32
  unique_name = f"{uuid.uuid4().hex[:8]}_{filename}"
 
41
  except: return {"pages": 0, "name": "Error", "title": ""}
42
 
43
  def _parse_range_groups(self, range_str: str, max_pages: int) -> list:
 
44
  groups = []
45
  parts = range_str.split(',')
46
  for part in parts:
 
85
  except: return None
86
 
87
  def get_preview_indices_from_string(self, range_str: str, max_pages: int) -> list:
 
88
  key_pages = []
89
  parts = range_str.split(',')
90
  for part in parts:
 
149
 
150
  def compress_pdf(self, file_path: str, power: int = 3) -> str:
151
  if not file_path: raise ValueError("Falta archivo.")
 
152
  q = {1: "/prepress", 3: "/ebook", 4: "/screen"}
153
  gs_set = q.get(power, "/ebook")
154
  out = self._get_output_path("comprimido.pdf")
 
208
  return out
209
  except Exception as e: raise RuntimeError(f"Error: {e}")
210
 
211
+ # --- COMPARACIÓN DE TEXTO PALABRA POR PALABRA (v2.3) ---
212
  def compare_pdfs_text(self, path_a: str, path_b: str) -> str:
213
  """
214
+ Compara el TEXTO de dos PDFs tokenizando por PALABRAS.
215
+ Esto evita que líneas enteras se marquen como erróneas si solo cambia una palabra.
216
  """
217
  if not path_a or not path_b: raise ValueError("Faltan archivos.")
218
 
219
+ def get_all_words(path):
 
220
  try:
221
  reader = PdfReader(path)
222
  text = ""
223
  for page in reader.pages:
224
  extracted = page.extract_text()
225
+ if extracted: text += extracted + " "
226
+ # Tokenizar por palabras (split elimina espacios extra y saltos de línea)
227
+ return text.split()
228
  except Exception as e:
229
  raise RuntimeError(f"Error leyendo PDF: {e}")
230
 
231
+ # Obtenemos listas de palabras ['La', 'casa', 'es', 'roja'...]
232
+ words_a = get_all_words(path_a)
233
+ words_b = get_all_words(path_b)
234
 
235
+ # Calculamos diferencias palabra por palabra
236
+ diff = difflib.ndiff(words_a, words_b)
237
 
238
+ # Generar PDF
239
+ output_path = self._get_output_path("informe_diferencias_palabras.pdf")
240
  doc = SimpleDocTemplate(output_path, pagesize=A4)
241
  styles = getSampleStyleSheet()
242
 
243
+ # Estilo para el cuerpo del texto
244
+ style_body = ParagraphStyle(
245
+ 'Body',
246
+ parent=styles['BodyText'],
247
+ alignment=TA_JUSTIFY,
248
+ fontSize=11,
249
+ leading=14
250
+ )
251
 
252
  story = []
253
+ story.append(Paragraph("Informe de Comparación (Modo Palabras)", styles['Heading1']))
254
+ story.append(Spacer(1, 12))
255
+
256
+ # Leyenda
257
+ legend = '<b>Leyenda:</b> <font color="red"><strike>Eliminado</strike></font> | <font color="green"><b>Añadido</b></font> | Texto Común'
258
+ story.append(Paragraph(legend, style_body))
259
+ story.append(Spacer(1, 12))
260
+ story.append(Paragraph(f"<b>A:</b> {os.path.basename(path_a)} | <b>B:</b> {os.path.basename(path_b)}", style_body))
261
  story.append(Spacer(1, 12))
 
 
 
262
 
263
+ # Reconstrucci贸n del texto
264
+ # Acumularemos fragmentos HTML para crear párrafos.
265
+ # ReportLab tiene límites de tamaño por párrafo, así que hacemos "flush" cada cierto tiempo.
266
 
267
+ current_html = ""
268
+ word_count = 0
269
 
270
+ for token in diff:
271
+ code = token[:2]
272
+ word = token[2:]
273
 
274
+ # Escapar caracteres especiales XML
275
+ safe_word = word.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
276
+
277
+ chunk = ""
 
278
  if code == '- ':
279
+ # Eliminado (Rojo + Tachado)
280
+ chunk = f'<font color="red"><strike>{safe_word}</strike></font> '
 
 
 
281
  elif code == '+ ':
282
+ # Añadido (Verde + Negrita)
283
+ chunk = f'<font color="green"><b>{safe_word}</b></font> '
 
 
 
284
  elif code == ' ':
285
+ # Igual (Negro)
286
+ chunk = f'{safe_word} '
287
+ # Ignoramos líneas '?'
288
+
289
+ current_html += chunk
290
+ word_count += 1
291
+
292
+ # Crear un nuevo párrafo cada ~300 palabras para evitar problemas de renderizado
293
+ if word_count > 300 and code == ' ': # Cortar preferiblemente en texto normal
294
+ story.append(Paragraph(current_html, style_body))
295
+ story.append(Spacer(1, 6))
296
+ current_html = ""
297
+ word_count = 0
298
 
299
+ # Añadir el resto
300
+ if current_html:
301
+ story.append(Paragraph(current_html, style_body))
302
 
303
  doc.build(story)
304
  return output_path
 
332
  try:
333
  imgs = convert_from_path(f, dpi=150)
334
  prs = Presentation()
 
335
  blank = 6
336
  for i, img in enumerate(imgs):
337
  ip = self._get_output_path(f"slide_{i}.jpg")
338
  img.save(ip, "JPEG")
339
  slide = prs.slides.add_slide(prs.slide_layouts[blank])
 
340
  slide.shapes.add_picture(ip, Inches(0), Inches(0), width=prs.slide_width)
341
  out = self._get_output_path(os.path.basename(f).replace(".pdf", ".pptx"))
342
  prs.save(out)