DaniFera committed on
Commit
df4812e
verified
1 Parent(s): 8368ba5

Update core.py

Browse files
Files changed (1) hide show
  1. core.py +258 -257
core.py CHANGED
@@ -1,11 +1,11 @@
1
- # Versión 2.0: Core Completo con Excel y PowerPoint
 
2
 
3
  import os
4
  import zipfile
5
  import uuid
6
  import subprocess
7
- import cv2
8
- import numpy as np
9
  import pdfplumber
10
  import pandas as pd
11
  from pypdf import PdfWriter, PdfReader
@@ -14,19 +14,20 @@ from pdf2docx import Converter
14
  from PIL import Image
15
  from pptx import Presentation
16
  from pptx.util import Inches
 
 
 
 
 
 
 
17
  from config import TEMP_DIR
18
 
19
  class PDFEngine:
20
- """
21
- Clase principal que encapsula toda la lógica de manipulación de archivos.
22
- Sigue el principio de Responsabilidad Única (SRP).
23
- """
24
-
25
- # --- UTILIDADES INTERNAS ---
26
 
 
27
  @staticmethod
28
  def _get_output_path(filename: str) -> str:
29
- """Genera una ruta única en el directorio temporal."""
30
  unique_name = f"{uuid.uuid4().hex[:8]}_{filename}"
31
  return os.path.join(TEMP_DIR, unique_name)
32
 
@@ -36,10 +37,10 @@ class PDFEngine:
36
  meta = reader.metadata
37
  title = meta.title if meta and meta.title else "Sin t铆tulo"
38
  return {"pages": len(reader.pages), "name": os.path.basename(file_path), "title": title}
39
- except Exception:
40
- return {"pages": 0, "name": "Error", "title": ""}
41
 
42
  def _parse_range_groups(self, range_str: str, max_pages: int) -> list:
 
43
  groups = []
44
  parts = range_str.split(',')
45
  for part in parts:
@@ -61,31 +62,30 @@ class PDFEngine:
61
  if current_group: groups.append({"label": part, "indices": current_group})
62
  return groups
63
 
64
- # --- PREVISUALIZACI脫N ---
65
-
66
- def generate_preview(self, file_path: str, page_number: int) -> str:
67
  try:
68
- images = convert_from_path(file_path, first_page=page_number, last_page=page_number, size=(None, 400))
69
- if images:
70
- output_path = self._get_output_path(f"preview_pg{page_number}.jpg")
71
- images[0].save(output_path, "JPEG")
72
- return output_path
73
- return None
74
- except Exception: return None
75
-
76
- def get_rotated_preview(self, file_path: str, angle: int) -> str:
77
- if not file_path: return None
78
  try:
79
- images = convert_from_path(file_path, first_page=1, last_page=1, size=(None, 500))
80
- if not images: return None
81
- img = images[0]
82
- if angle != 0: img = img.rotate(-angle, expand=True)
83
- output_path = self._get_output_path(f"preview_rot_{angle}.jpg")
84
- img.save(output_path, "JPEG")
85
- return output_path
86
- except Exception: return None
87
 
88
  def get_preview_indices_from_string(self, range_str: str, max_pages: int) -> list:
 
89
  key_pages = []
90
  parts = range_str.split(',')
91
  for part in parts:
@@ -102,119 +102,259 @@ class PDFEngine:
102
  except ValueError: continue
103
  return sorted(list(set(key_pages)))
104
 
105
- # --- FUNCIONALIDADES DE GESTI脫N DE P脕GINAS ---
106
-
107
  def merge_pdfs(self, file_paths: list, order_indices: list = None) -> str:
108
  if not file_paths: raise ValueError("No hay archivos.")
109
- ordered_paths = []
110
  if order_indices and len(order_indices) == len(file_paths):
111
- try:
112
- for idx in order_indices: ordered_paths.append(file_paths[int(idx)])
113
- except: ordered_paths = file_paths
114
- else: ordered_paths = file_paths
115
- merger = PdfWriter()
116
- try:
117
- for path in ordered_paths: merger.append(path)
118
- output_path = self._get_output_path("unido_ordenado.pdf")
119
- with open(output_path, "wb") as f: merger.write(f)
120
- return output_path
121
- except Exception as e: raise RuntimeError(f"Error al unir: {str(e)}")
122
- finally: merger.close()
123
 
124
  def split_pdf_custom(self, file_path: str, range_str: str) -> str:
125
  if not file_path: raise ValueError("Falta archivo.")
126
- reader = PdfReader(file_path)
127
- total = len(reader.pages)
128
- groups = self._parse_range_groups(range_str, total)
129
- if not groups: raise ValueError("Rango inv谩lido.")
130
- generated = []
131
  base = os.path.basename(file_path).replace(".pdf", "")
132
- for group in groups:
133
- writer = PdfWriter()
134
- for idx in group["indices"]: writer.add_page(reader.pages[idx])
135
  safe = group["label"].replace(" ", "")
136
- out = self._get_output_path(f"{base}_part_{safe}.pdf")
137
- with open(out, "wb") as f: writer.write(f)
138
- generated.append(out)
139
- zname = f"{base}_split_files.zip"
140
- zpath = self._get_output_path(zname)
141
- with zipfile.ZipFile(zpath, 'w') as zipf:
142
- for f in generated: zipf.write(f, arcname=os.path.basename(f))
143
- return zpath
144
 
145
  def reorder_pages(self, file_path: str, order_str: str) -> str:
146
  if not file_path: raise ValueError("Falta archivo.")
147
- reader = PdfReader(file_path)
148
- groups = self._parse_range_groups(order_str, len(reader.pages))
149
- if not groups: raise ValueError("Orden inv谩lido.")
150
- flat = []
151
- for g in groups: flat.extend(g["indices"])
152
- writer = PdfWriter()
153
- for idx in flat: writer.add_page(reader.pages[idx])
154
  out = self._get_output_path("reordenado.pdf")
155
- with open(out, "wb") as f: writer.write(f)
156
  return out
157
 
158
- # --- EDICI脫N Y SEGURIDAD ---
159
-
160
- def compress_pdf(self, file_path: str, power: int = 2) -> str:
161
  if not file_path: raise ValueError("Falta archivo.")
162
- quality = {0: "/default", 1: "/prepress", 2: "/printer", 3: "/ebook", 4: "/screen"}
163
- gs_setting = quality.get(power, "/ebook")
164
- output_path = self._get_output_path("comprimido.pdf")
165
- cmd = ["gs", "-sDEVICE=pdfwrite", "-dCompatibilityLevel=1.4", f"-dPDFSETTINGS={gs_setting}", "-dNOPAUSE", "-dQUIET", "-dBATCH", f"-sOutputFile={output_path}", file_path]
 
166
  try:
167
  subprocess.run(cmd, check=True)
168
- return output_path
169
- except subprocess.CalledProcessError as e: raise RuntimeError(f"Error Ghostscript: {e}")
170
- except FileNotFoundError: raise RuntimeError("Falta Ghostscript (packages.txt).")
171
 
172
  def protect_pdf(self, file_path: str, password: str) -> str:
173
  if not file_path or not password: raise ValueError("Faltan datos.")
174
  try:
175
- reader = PdfReader(file_path)
176
- writer = PdfWriter()
177
- for page in reader.pages: writer.add_page(page)
178
- writer.encrypt(password)
179
  out = self._get_output_path("protegido.pdf")
180
- with open(out, "wb") as f: writer.write(f)
181
  return out
182
- except Exception as e: raise RuntimeError(f"Error proteger: {e}")
183
 
184
  def rotate_pdf(self, file_path: str, angle: int) -> str:
185
  if not file_path: raise ValueError("Falta archivo.")
186
  try:
187
- reader = PdfReader(file_path)
188
- writer = PdfWriter()
189
- for page in reader.pages:
190
- page.rotate(angle)
191
- writer.add_page(page)
192
  out = self._get_output_path(f"rotado_{angle}.pdf")
193
- with open(out, "wb") as f: writer.write(f)
194
  return out
195
- except Exception as e: raise RuntimeError(f"Error rotar: {e}")
196
 
197
- def update_metadata(self, file_path: str, title: str, author: str, subject: str) -> str:
198
- if not file_path: raise ValueError("Falta archivo.")
199
  try:
200
- reader = PdfReader(file_path)
201
- writer = PdfWriter()
202
- for page in reader.pages: writer.add_page(page)
203
- writer.add_metadata({"/Title": title, "/Author": author, "/Subject": subject, "/Producer": "OpenPDF Tools"})
204
- out = self._get_output_path("editado_meta.pdf")
205
- with open(out, "wb") as f: writer.write(f)
206
  return out
207
- except Exception as e: raise RuntimeError(f"Error metadata: {e}")
208
 
209
- # --- CONVERSIONES GENERALES ---
 
 
 
 
 
 
 
 
 
 
 
210
 
211
- def pdf_to_images_zip(self, file_path: str) -> str:
212
- if not file_path: raise ValueError("Falta archivo.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
213
  try:
214
- images = convert_from_path(file_path, dpi=150)
215
- base = os.path.basename(file_path).replace(".pdf", "")
 
 
 
 
 
 
 
 
 
216
  paths = []
217
- for i, img in enumerate(images):
 
218
  p = self._get_output_path(f"{base}_{i+1}.jpg")
219
  img.save(p, "JPEG")
220
  paths.append(p)
@@ -222,156 +362,17 @@ class PDFEngine:
222
  with zipfile.ZipFile(zp, 'w') as z:
223
  for p in paths: z.write(p, arcname=os.path.basename(p))
224
  return zp
225
- except Exception as e: raise RuntimeError(f"Error PDF->IMG: {e}")
226
 
227
- def images_to_pdf(self, image_paths: list) -> str:
228
- if not image_paths: raise ValueError("No im谩genes.")
229
  try:
230
  objs = []
231
- for p in image_paths:
232
- img = Image.open(p)
233
- if img.mode != 'RGB': img = img.convert('RGB')
234
- objs.append(img)
235
  out = self._get_output_path("album.pdf")
236
  if objs: objs[0].save(out, "PDF", resolution=100.0, save_all=True, append_images=objs[1:])
237
  return out
238
- except Exception as e: raise RuntimeError(f"Error IMG->PDF: {e}")
239
-
240
- def pdf_to_word(self, file_path: str) -> str:
241
- if not file_path: raise ValueError("Falta archivo.")
242
- try:
243
- docx = os.path.basename(file_path).replace(".pdf", ".docx")
244
- out = self._get_output_path(docx)
245
- cv = Converter(file_path)
246
- cv.convert(out, start=0, end=None)
247
- cv.close()
248
- return out
249
- except Exception as e: raise RuntimeError(f"Error PDF->Word: {e}")
250
-
251
- def extract_text(self, file_path: str) -> str:
252
- if not file_path: raise ValueError("Falta archivo.")
253
- try:
254
- reader = PdfReader(file_path)
255
- content = []
256
- for i, page in enumerate(reader.pages):
257
- txt = page.extract_text()
258
- if txt: content.append(f"--- P谩g {i+1} ---\n{txt}\n")
259
- out = self._get_output_path(os.path.basename(file_path).replace(".pdf", ".txt"))
260
- with open(out, "w", encoding="utf-8") as f: f.write("\n".join(content))
261
- return out
262
- except Exception as e: raise RuntimeError(f"Error texto: {e}")
263
-
264
- # --- NUEVAS CONVERSIONES OFFICE (v2.0) ---
265
-
266
- def pdf_to_excel(self, file_path: str) -> str:
267
- """
268
- Extrae tablas del PDF y las guarda en un Excel (XLSX).
269
- Crea una hoja por cada p谩gina que contenga tablas.
270
- """
271
- if not file_path: raise ValueError("Falta archivo.")
272
-
273
- try:
274
- xlsx_name = os.path.basename(file_path).replace(".pdf", ".xlsx")
275
- output_path = self._get_output_path(xlsx_name)
276
-
277
- has_tables = False
278
-
279
- # Usamos ExcelWriter para escribir m煤ltiples hojas
280
- with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
281
- with pdfplumber.open(file_path) as pdf:
282
- for i, page in enumerate(pdf.pages):
283
- tables = page.extract_tables()
284
- if tables:
285
- has_tables = True
286
- # Si hay varias tablas en una p谩gina, las concatenamos o las ponemos una debajo de otra
287
- # Aqu铆, por simplicidad, cogemos la tabla m谩s grande o concatenamos
288
- df_page = pd.DataFrame()
289
- for table in tables:
290
- df = pd.DataFrame(table)
291
- # Usar la primera fila como header si parece un header
292
- new_header = df.iloc[0]
293
- df = df[1:]
294
- df.columns = new_header
295
- df_page = pd.concat([df_page, df], ignore_index=True)
296
-
297
- sheet_name = f"Pagina_{i+1}"
298
- df_page.to_excel(writer, sheet_name=sheet_name, index=False)
299
-
300
- if not has_tables:
301
- raise ValueError("No se detectaron tablas con bordes claros en este PDF.")
302
-
303
- return output_path
304
- except Exception as e:
305
- raise RuntimeError(f"Error PDF->Excel: {str(e)}")
306
-
307
- def pdf_to_pptx(self, file_path: str) -> str:
308
- """
309
- Convierte PDF a PowerPoint (PPTX).
310
- Estrategia: Convertir cada p谩gina a Imagen -> Pegar en Diapositiva.
311
- Esto preserva el formato visual exacto.
312
- """
313
- if not file_path: raise ValueError("Falta archivo.")
314
-
315
- try:
316
- # 1. Convertir PDF a im谩genes (HQ)
317
- images = convert_from_path(file_path, dpi=200)
318
-
319
- # 2. Crear presentaci贸n
320
- prs = Presentation()
321
-
322
- # Definir layout en blanco (index 6 suele ser blank en tema default)
323
- BLANK_SLIDE_LAYOUT = 6
324
-
325
- for i, img in enumerate(images):
326
- # Guardar imagen temporal
327
- img_path = self._get_output_path(f"temp_slide_{i}.jpg")
328
- img.save(img_path, "JPEG")
329
-
330
- # A帽adir diapositiva
331
- slide = prs.slides.add_slide(prs.slide_layouts[BLANK_SLIDE_LAYOUT])
332
-
333
- # Ajustar tama帽o de la diapositiva al tama帽o de la imagen?
334
- # Por simplicidad, ajustamos la imagen al tama帽o de la diapositiva est谩ndar (10x7.5 inches)
335
- # left, top, width, height
336
- slide.shapes.add_picture(img_path, Inches(0), Inches(0), width=prs.slide_width)
337
-
338
- pptx_name = os.path.basename(file_path).replace(".pdf", ".pptx")
339
- output_path = self._get_output_path(pptx_name)
340
- prs.save(output_path)
341
-
342
- return output_path
343
- except Exception as e:
344
- raise RuntimeError(f"Error PDF->PPTX: {str(e)}")
345
-
346
- # --- AN脕LISIS ---
347
-
348
- def compare_pdfs_visual(self, path_a: str, path_b: str) -> str:
349
- if not path_a or not path_b: raise ValueError("Dos archivos requeridos.")
350
- try:
351
- imgs_a = convert_from_path(path_a, dpi=100)
352
- imgs_b = convert_from_path(path_b, dpi=100)
353
- except Exception as e: raise RuntimeError(f"Error leyendo PDFs: {e}")
354
-
355
- min_pages = min(len(imgs_a), len(imgs_b))
356
- diff_pages = []
357
- for i in range(min_pages):
358
- arr_a = np.array(imgs_a[i])
359
- arr_b = np.array(imgs_b[i])
360
- if arr_a.shape != arr_b.shape:
361
- h, w = arr_a.shape[:2]
362
- arr_b = cv2.resize(arr_b, (w, h))
363
- gray_a = cv2.cvtColor(arr_a, cv2.COLOR_RGB2GRAY)
364
- gray_b = cv2.cvtColor(arr_b, cv2.COLOR_RGB2GRAY)
365
- diff = cv2.absdiff(gray_a, gray_b)
366
- _, thresh = cv2.threshold(diff, 30, 255, cv2.THRESH_BINARY)
367
- contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
368
- res = arr_a.copy()
369
- for cnt in contours:
370
- x, y, w, h = cv2.boundingRect(cnt)
371
- cv2.rectangle(res, (x, y), (x + w, y + h), (255, 0, 255), 2)
372
- diff_pages.append(Image.fromarray(res))
373
-
374
- if not diff_pages: raise ValueError("Error en comparaci贸n.")
375
- out = self._get_output_path("comparativa.pdf")
376
- diff_pages[0].save(out, "PDF", resolution=100.0, save_all=True, append_images=diff_pages[1:])
377
- return out
 
1
+ # Versión 2.2: Core con Comparación de Texto (ReportLab)
2
+ # Autor: Gemini (AI Assistant)
3
 
4
  import os
5
  import zipfile
6
  import uuid
7
  import subprocess
8
+ import difflib
 
9
  import pdfplumber
10
  import pandas as pd
11
  from pypdf import PdfWriter, PdfReader
 
14
  from PIL import Image
15
  from pptx import Presentation
16
  from pptx.util import Inches
17
+
18
+ # ReportLab para generar el PDF de diferencias
19
+ from reportlab.lib.pagesizes import A4
20
+ from reportlab.lib import colors
21
+ from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
22
+ from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
23
+
24
  from config import TEMP_DIR
25
 
26
  class PDFEngine:
 
 
 
 
 
 
27
 
28
+ # --- UTILIDADES ---
29
  @staticmethod
30
  def _get_output_path(filename: str) -> str:
 
31
  unique_name = f"{uuid.uuid4().hex[:8]}_{filename}"
32
  return os.path.join(TEMP_DIR, unique_name)
33
 
 
37
  meta = reader.metadata
38
  title = meta.title if meta and meta.title else "Sin t铆tulo"
39
  return {"pages": len(reader.pages), "name": os.path.basename(file_path), "title": title}
40
+ except: return {"pages": 0, "name": "Error", "title": ""}
 
41
 
42
  def _parse_range_groups(self, range_str: str, max_pages: int) -> list:
43
+ # (Lógica idéntica a versiones anteriores)
44
  groups = []
45
  parts = range_str.split(',')
46
  for part in parts:
 
62
  if current_group: groups.append({"label": part, "indices": current_group})
63
  return groups
64
 
65
+ # --- PREVIEW ---
66
def generate_preview(self, f, p):
    """Render page ``p`` of the PDF at ``f`` to a JPEG preview.

    Args:
        f: Path of the PDF to render.
        p: Page number, passed as both ``first_page`` and ``last_page``.

    Returns:
        Path of the saved JPEG, or None when no image was produced or
        rendering failed (previews are best-effort).
    """
    try:
        pages = convert_from_path(f, first_page=p, last_page=p, size=(None, 400))
        if not pages:
            return None
        out = self._get_output_path(f"preview_pg{p}.jpg")
        pages[0].save(out, "JPEG")
        return out
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        return None
74
+
75
def get_rotated_preview(self, f, a):
    """Render the first page of ``f`` rotated by ``a`` degrees as a JPEG.

    Args:
        f: Path of the PDF; a falsy value short-circuits to None.
        a: Rotation angle; 0 leaves the rendered page untouched.

    Returns:
        Path of the saved JPEG, or None when there is no input, no image was
        produced, or rendering failed (previews are best-effort).
    """
    if not f:
        return None
    try:
        pages = convert_from_path(f, first_page=1, last_page=1, size=(None, 500))
        if not pages:
            return None
        img = pages[0]
        if a != 0:
            # The angle is negated before rotating; presumably so a positive
            # ``a`` turns the preview clockwise (verify against the UI).
            img = img.rotate(-a, expand=True)
        out = self._get_output_path(f"rot_prev_{a}.jpg")
        img.save(out, "JPEG")
        return out
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        return None
86
 
87
  def get_preview_indices_from_string(self, range_str: str, max_pages: int) -> list:
88
+ # (Igual que antes)
89
  key_pages = []
90
  parts = range_str.split(',')
91
  for part in parts:
 
102
  except ValueError: continue
103
  return sorted(list(set(key_pages)))
104
 
105
+ # --- CORE PDF TOOLS ---
 
106
  def merge_pdfs(self, file_paths: list, order_indices: list = None) -> str:
107
  if not file_paths: raise ValueError("No hay archivos.")
108
+ ordered = []
109
  if order_indices and len(order_indices) == len(file_paths):
110
+ try: ordered = [file_paths[int(i)] for i in order_indices]
111
+ except: ordered = file_paths
112
+ else: ordered = file_paths
113
+ m = PdfWriter()
114
+ for p in ordered: m.append(p)
115
+ out = self._get_output_path("unido.pdf")
116
+ with open(out, "wb") as f: m.write(f)
117
+ m.close()
118
+ return out
 
 
 
119
 
120
  def split_pdf_custom(self, file_path: str, range_str: str) -> str:
121
  if not file_path: raise ValueError("Falta archivo.")
122
+ r = PdfReader(file_path)
123
+ g = self._parse_range_groups(range_str, len(r.pages))
124
+ if not g: raise ValueError("Rango inv谩lido.")
125
+ gen = []
 
126
  base = os.path.basename(file_path).replace(".pdf", "")
127
+ for group in g:
128
+ w = PdfWriter()
129
+ for i in group["indices"]: w.add_page(r.pages[i])
130
  safe = group["label"].replace(" ", "")
131
+ p = self._get_output_path(f"{base}_part_{safe}.pdf")
132
+ with open(p, "wb") as f: w.write(f)
133
+ gen.append(p)
134
+ zp = self._get_output_path(f"{base}_split.zip")
135
+ with zipfile.ZipFile(zp, 'w') as z:
136
+ for f in gen: z.write(f, arcname=os.path.basename(f))
137
+ return zp
 
138
 
139
  def reorder_pages(self, file_path: str, order_str: str) -> str:
140
  if not file_path: raise ValueError("Falta archivo.")
141
+ r = PdfReader(file_path)
142
+ g = self._parse_range_groups(order_str, len(r.pages))
143
+ if not g: raise ValueError("Orden inv谩lido.")
144
+ w = PdfWriter()
145
+ flat = [i for group in g for i in group["indices"]]
146
+ for i in flat: w.add_page(r.pages[i])
 
147
  out = self._get_output_path("reordenado.pdf")
148
+ with open(out, "wb") as f: w.write(f)
149
  return out
150
 
151
def compress_pdf(self, file_path: str, power: int = 3) -> str:
    """Compress a PDF by re-writing it with the ``gs`` command-line tool.

    Args:
        file_path: Path of the PDF to compress.
        power: Compression level selecting the ``-dPDFSETTINGS`` preset:
            1 -> /prepress, 3 -> /ebook, 4 -> /screen. Any other value
            falls back to /ebook.

    Returns:
        Path of the compressed PDF.

    Raises:
        ValueError: If ``file_path`` is empty.
        RuntimeError: If ``gs`` cannot be launched or exits with an error;
            the underlying exception is attached as ``__cause__``.
    """
    if not file_path:
        raise ValueError("Falta archivo.")

    presets = {1: "/prepress", 3: "/ebook", 4: "/screen"}
    gs_set = presets.get(power, "/ebook")
    out = self._get_output_path("comprimido.pdf")
    cmd = ["gs", "-sDEVICE=pdfwrite", "-dCompatibilityLevel=1.4", f"-dPDFSETTINGS={gs_set}", "-dNOPAUSE", "-dQUIET", "-dBATCH", f"-sOutputFile={out}", file_path]
    try:
        subprocess.run(cmd, check=True)
    except (subprocess.CalledProcessError, OSError) as exc:
        # OSError covers a missing or non-executable ``gs`` binary; chaining
        # keeps the real cause visible, and interrupts are no longer trapped.
        raise RuntimeError("Error comprimiendo (Ghostscript).") from exc
    return out
 
162
 
163
  def protect_pdf(self, file_path: str, password: str) -> str:
164
  if not file_path or not password: raise ValueError("Faltan datos.")
165
  try:
166
+ r = PdfReader(file_path)
167
+ w = PdfWriter()
168
+ for p in r.pages: w.add_page(p)
169
+ w.encrypt(password)
170
  out = self._get_output_path("protegido.pdf")
171
+ with open(out, "wb") as f: w.write(f)
172
  return out
173
+ except Exception as e: raise RuntimeError(f"Error: {e}")
174
 
175
  def rotate_pdf(self, file_path: str, angle: int) -> str:
176
  if not file_path: raise ValueError("Falta archivo.")
177
  try:
178
+ r = PdfReader(file_path)
179
+ w = PdfWriter()
180
+ for p in r.pages:
181
+ p.rotate(angle)
182
+ w.add_page(p)
183
  out = self._get_output_path(f"rotado_{angle}.pdf")
184
+ with open(out, "wb") as f: w.write(f)
185
  return out
186
+ except Exception as e: raise RuntimeError(f"Error: {e}")
187
 
188
def update_metadata(self, f, t, a, s):
    """Copy the PDF at ``f`` and set its title, author and subject.

    ``t``, ``a`` and ``s`` are written as /Title, /Author and /Subject; the
    /Producer entry is always set to "OpenPDF Tools". Returns the path of
    the new PDF. Raises ValueError when ``f`` is empty and RuntimeError when
    any step fails.
    """
    if not f:
        raise ValueError("Falta archivo.")
    try:
        source = PdfReader(f)
        target = PdfWriter()
        for page in source.pages:
            target.add_page(page)
        info = {"/Title": t, "/Author": a, "/Subject": s, "/Producer": "OpenPDF Tools"}
        target.add_metadata(info)
        destination = self._get_output_path("meta.pdf")
        with open(destination, "wb") as fh:
            target.write(fh)
        return destination
    except Exception as e:
        raise RuntimeError(f"Error: {e}")
199
 
200
def extract_text(self, f):
    """Extract the text of every page into a UTF-8 ``.txt`` file.

    Each page that yields text is written under a ``--- Pág N ---`` marker
    (1-based); pages without text are omitted.

    Args:
        f: Path of the PDF to read.

    Returns:
        Path of the generated text file.

    Raises:
        ValueError: If ``f`` is empty.
        RuntimeError: If reading the PDF or writing the text file fails.
    """
    if not f:
        raise ValueError("Falta archivo.")
    try:
        reader = PdfReader(f)
        chunks = []
        for number, page in enumerate(reader.pages, start=1):
            text = page.extract_text()
            if text:
                chunks.append(f"--- Pág {number} ---\n{text}\n")

        # Always end in ".txt": swap a trailing ".pdf" (any case), otherwise
        # append. str.replace left names like "X.PDF" without a text suffix.
        name = os.path.basename(f)
        stem, ext = os.path.splitext(name)
        txt_name = f"{stem}.txt" if ext.lower() == ".pdf" else f"{name}.txt"

        out = self._get_output_path(txt_name)
        with open(out, "w", encoding="utf-8") as fh:
            fh.write("\n".join(chunks))
        return out
    except Exception as e:
        raise RuntimeError(f"Error: {e}") from e
212
 
213
+ # --- NUEVA COMPARACIÓN DE TEXTO (v2.2) ---
214
def compare_pdfs_text(self, path_a: str, path_b: str) -> str:
    """Compare the text of two PDFs and write a PDF report of the differences.

    The text of both files is extracted, split into lines and diffed with
    ``difflib.ndiff``. Lines only in A are rendered struck through with the
    "Deleted" style, lines only in B with the "Added" style, and common
    lines with the normal body style. Lines that are blank after stripping
    and ndiff's ``? `` hint lines are omitted.

    Args:
        path_a: Path of the original PDF.
        path_b: Path of the modified PDF.

    Returns:
        Path of the generated report PDF.

    Raises:
        ValueError: If either path is empty.
        RuntimeError: If the text of either PDF cannot be read.
    """
    # Local import keeps this block self-contained (stdlib only).
    from xml.sax.saxutils import escape

    if not path_a or not path_b:
        raise ValueError("Faltan archivos.")

    def get_text_lines(path):
        """Return the extracted text of every page as a list of lines."""
        try:
            reader = PdfReader(path)
            pieces = []
            for page in reader.pages:
                extracted = page.extract_text()
                if extracted:
                    pieces.append(extracted + "\n")
            return "".join(pieces).splitlines()
        except Exception as e:
            raise RuntimeError(f"Error leyendo PDF: {e}") from e

    lines_a = get_text_lines(path_a)
    lines_b = get_text_lines(path_b)

    output_path = self._get_output_path("informe_diferencias.pdf")
    doc = SimpleDocTemplate(output_path, pagesize=A4)
    styles = getSampleStyleSheet()

    style_normal = styles['BodyText']
    style_del = ParagraphStyle('Deleted', parent=style_normal, textColor=colors.red, backColor=colors.mistyrose, strike=True)
    style_add = ParagraphStyle('Added', parent=style_normal, textColor=colors.darkgreen, backColor=colors.honeydew, fontName='Helvetica-Bold')
    style_header = styles['Heading1']

    # Paragraph text is parsed as markup, so anything taken from user input
    # (file names as well as PDF text) must be escaped before interpolation.
    name_a = escape(os.path.basename(path_a))
    name_b = escape(os.path.basename(path_b))

    story = [
        Paragraph("Informe de Comparación de Texto", style_header),
        Spacer(1, 12),
        Paragraph(f"<b>Archivo A (Original):</b> {name_a}", style_normal),
        Paragraph(f"<b>Archivo B (Modificado):</b> {name_b}", style_normal),
        Spacer(1, 24),
    ]

    has_changes = False
    for line in difflib.ndiff(lines_a, lines_b):
        # ndiff prefixes every line with a two-character code:
        # '- ' only in A, '+ ' only in B, '  ' common, '? ' intraline hint.
        code = line[:2]
        content = escape(line[2:].strip())
        if not content:
            continue

        if code == '- ':
            story.append(Paragraph(f"<strike>{content}</strike>", style_del))
            story.append(Spacer(1, 4))
            has_changes = True
        elif code == '+ ':
            story.append(Paragraph(content, style_add))
            story.append(Spacer(1, 4))
            has_changes = True
        elif code == '  ':
            # Unchanged context is kept in full so the report reads like the
            # document itself.
            story.append(Paragraph(content, style_normal))
            story.append(Spacer(1, 2))
        # '? ' hint lines are intentionally dropped.

    if not has_changes:
        story.append(Paragraph("<b>No se encontraron diferencias textuales entre los documentos.</b>", style_normal))

    doc.build(story)
    return output_path
298
+
299
+ # --- CONVERSIONES OFFICE (v2.0) ---
300
def pdf_to_excel(self, f):
    """Extract the tables of a PDF into an XLSX workbook, one sheet per page.

    For every page, each non-empty table becomes a DataFrame whose first row
    is used as the header; the tables of a page are stacked into a single
    sheet named ``Pag_<n>`` (1-based). Pages without tables get no sheet.

    Args:
        f: Path of the PDF to read.

    Returns:
        Path of the generated workbook.

    Raises:
        ValueError: If ``f`` is empty.
        RuntimeError: If no table is found or any read/write step fails.
    """
    if not f:
        raise ValueError("Falta archivo.")
    try:
        name = os.path.basename(f)
        stem, ext = os.path.splitext(name)
        xlsx_name = f"{stem}.xlsx" if ext.lower() == ".pdf" else f"{name}.xlsx"
        out = self._get_output_path(xlsx_name)

        # Collect all sheets first: opening the ExcelWriter before knowing
        # whether any table exists makes a table-less PDF fail while saving
        # an empty workbook, hiding the "no tables" message below.
        sheets = []
        with pdfplumber.open(f) as pdf:
            for number, page in enumerate(pdf.pages, start=1):
                frames = []
                for table in page.extract_tables() or []:
                    if not table:
                        continue  # an empty table has no header row to promote
                    df = pd.DataFrame(table)
                    header = df.iloc[0]
                    df = df[1:]
                    df.columns = header
                    frames.append(df)
                if frames:
                    sheets.append((f"Pag_{number}", pd.concat(frames, ignore_index=True)))

        if not sheets:
            raise ValueError("No se encontraron tablas.")

        with pd.ExcelWriter(out, engine='openpyxl') as writer:
            for sheet_name, frame in sheets:
                frame.to_excel(writer, sheet_name=sheet_name, index=False)
        return out
    except Exception as e:
        raise RuntimeError(f"Error Excel: {e}") from e
322
+
323
def pdf_to_pptx(self, f):
    """Convert a PDF into a PPTX with one full-width page image per slide.

    Every page is rendered at 150 dpi, encoded as JPEG and placed at the
    top-left corner of a new slide, scaled to the slide width.

    Args:
        f: Path of the PDF to convert.

    Returns:
        Path of the generated presentation.

    Raises:
        ValueError: If ``f`` is empty.
        RuntimeError: If rendering or building the presentation fails.
    """
    import io  # local import keeps this block self-contained (stdlib only)

    if not f:
        raise ValueError("Falta archivo.")
    try:
        pages = convert_from_path(f, dpi=150)
        prs = Presentation()
        blank = 6  # index of the layout used for every slide (blank layout)
        for img in pages:
            # Encode in memory: writing one JPEG per slide into TEMP_DIR left
            # files behind that nothing ever removed.
            buffer = io.BytesIO()
            img.save(buffer, "JPEG")
            buffer.seek(0)
            slide = prs.slides.add_slide(prs.slide_layouts[blank])
            slide.shapes.add_picture(buffer, Inches(0), Inches(0), width=prs.slide_width)

        name = os.path.basename(f)
        stem, ext = os.path.splitext(name)
        pptx_name = f"{stem}.pptx" if ext.lower() == ".pdf" else f"{name}.pptx"
        out = self._get_output_path(pptx_name)
        prs.save(out)
        return out
    except Exception as e:
        raise RuntimeError(f"Error PPTX: {e}") from e
340
+
341
def pdf_to_word(self, f):
    """Convert a PDF into a DOCX file using ``Converter``.

    Args:
        f: Path of the PDF to convert.

    Returns:
        Path of the generated document.

    Raises:
        ValueError: If ``f`` is empty.
        RuntimeError: If the conversion fails.
    """
    if not f:
        raise ValueError("Falta archivo.")
    try:
        name = os.path.basename(f)
        stem, ext = os.path.splitext(name)
        docx_name = f"{stem}.docx" if ext.lower() == ".pdf" else f"{name}.docx"
        out = self._get_output_path(docx_name)

        cv = Converter(f)
        try:
            cv.convert(out, start=0, end=None)
        finally:
            # Close the converter even when the conversion raises.
            cv.close()
        return out
    except Exception as e:
        raise RuntimeError(f"Error Word: {e}") from e
350
+
351
+ def pdf_to_images_zip(self, f):
352
+ if not f: raise ValueError("Falta archivo.")
353
+ try:
354
+ imgs = convert_from_path(f, dpi=150)
355
  paths = []
356
+ base = os.path.basename(f).replace(".pdf", "")
357
+ for i, img in enumerate(imgs):
358
  p = self._get_output_path(f"{base}_{i+1}.jpg")
359
  img.save(p, "JPEG")
360
  paths.append(p)
 
362
  with zipfile.ZipFile(zp, 'w') as z:
363
  for p in paths: z.write(p, arcname=os.path.basename(p))
364
  return zp
365
+ except: raise RuntimeError("Error imgs")
366
 
367
def images_to_pdf(self, fs):
    """Combine the images at ``fs`` into a single multi-page PDF.

    Images that are not in RGB mode are converted to RGB before saving.

    Args:
        fs: Paths of the images, in page order.

    Returns:
        Path of the generated PDF.

    Raises:
        ValueError: If ``fs`` is empty.
        RuntimeError: If an image cannot be opened or the PDF cannot be
            written; the underlying exception is attached as ``__cause__``.
    """
    if not fs:
        raise ValueError("No imgs.")

    handles = []  # every image object created here, so all can be closed
    try:
        pages = []
        for path in fs:
            img = Image.open(path)
            handles.append(img)
            if img.mode != 'RGB':
                img = img.convert('RGB')
                handles.append(img)
            pages.append(img)
        out = self._get_output_path("album.pdf")
        if pages:
            pages[0].save(out, "PDF", resolution=100.0, save_all=True, append_images=pages[1:])
        return out
    except Exception as exc:
        # Exception (not a bare except) lets interrupts through and keeps
        # the real cause attached to the RuntimeError.
        raise RuntimeError("Error pdf") from exc
    finally:
        for img in handles:
            img.close()