Update core.py
Browse files
core.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
# Versión 2.
|
| 2 |
# Autor: Gemini (AI Assistant)
|
| 3 |
|
| 4 |
import os
|
|
@@ -20,12 +20,13 @@ from reportlab.lib.pagesizes import A4
|
|
| 20 |
from reportlab.lib import colors
|
| 21 |
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
|
| 22 |
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
|
|
|
|
| 23 |
|
| 24 |
from config import TEMP_DIR
|
| 25 |
|
| 26 |
class PDFEngine:
|
| 27 |
|
| 28 |
-
# --- UTILIDADES ---
|
| 29 |
@staticmethod
|
| 30 |
def _get_output_path(filename: str) -> str:
|
| 31 |
unique_name = f"{uuid.uuid4().hex[:8]}_{filename}"
|
|
@@ -40,7 +41,6 @@ class PDFEngine:
|
|
| 40 |
except: return {"pages": 0, "name": "Error", "title": ""}
|
| 41 |
|
| 42 |
def _parse_range_groups(self, range_str: str, max_pages: int) -> list:
|
| 43 |
-
# (Lógica idéntica a versiones anteriores)
|
| 44 |
groups = []
|
| 45 |
parts = range_str.split(',')
|
| 46 |
for part in parts:
|
|
@@ -85,7 +85,6 @@ class PDFEngine:
|
|
| 85 |
except: return None
|
| 86 |
|
| 87 |
def get_preview_indices_from_string(self, range_str: str, max_pages: int) -> list:
|
| 88 |
-
# (Igual que antes)
|
| 89 |
key_pages = []
|
| 90 |
parts = range_str.split(',')
|
| 91 |
for part in parts:
|
|
@@ -150,7 +149,6 @@ class PDFEngine:
|
|
| 150 |
|
| 151 |
def compress_pdf(self, file_path: str, power: int = 3) -> str:
|
| 152 |
if not file_path: raise ValueError("Falta archivo.")
|
| 153 |
-
# power: 1=Baja, 3=Media/eBook, 4=Alta/Screen
|
| 154 |
q = {1: "/prepress", 3: "/ebook", 4: "/screen"}
|
| 155 |
gs_set = q.get(power, "/ebook")
|
| 156 |
out = self._get_output_path("comprimido.pdf")
|
|
@@ -210,88 +208,97 @@ class PDFEngine:
|
|
| 210 |
return out
|
| 211 |
except Exception as e: raise RuntimeError(f"Error: {e}")
|
| 212 |
|
| 213 |
-
# ---
|
| 214 |
def compare_pdfs_text(self, path_a: str, path_b: str) -> str:
|
| 215 |
"""
|
| 216 |
-
Compara el TEXTO de dos PDFs
|
| 217 |
-
|
| 218 |
"""
|
| 219 |
if not path_a or not path_b: raise ValueError("Faltan archivos.")
|
| 220 |
|
| 221 |
-
|
| 222 |
-
def get_text_lines(path):
|
| 223 |
try:
|
| 224 |
reader = PdfReader(path)
|
| 225 |
text = ""
|
| 226 |
for page in reader.pages:
|
| 227 |
extracted = page.extract_text()
|
| 228 |
-
if extracted: text += extracted + "
|
| 229 |
-
#
|
| 230 |
-
return text.
|
| 231 |
except Exception as e:
|
| 232 |
raise RuntimeError(f"Error leyendo PDF: {e}")
|
| 233 |
|
| 234 |
-
|
| 235 |
-
|
|
|
|
| 236 |
|
| 237 |
-
#
|
| 238 |
-
diff = difflib.ndiff(
|
| 239 |
|
| 240 |
-
#
|
| 241 |
-
output_path = self._get_output_path("
|
| 242 |
doc = SimpleDocTemplate(output_path, pagesize=A4)
|
| 243 |
styles = getSampleStyleSheet()
|
| 244 |
|
| 245 |
-
#
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
|
|
|
|
|
|
|
|
|
| 250 |
|
| 251 |
story = []
|
| 252 |
-
story.append(Paragraph("Informe de Comparaci贸n
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 253 |
story.append(Spacer(1, 12))
|
| 254 |
-
story.append(Paragraph(f"<b>Archivo A (Original):</b> {os.path.basename(path_a)}", style_normal))
|
| 255 |
-
story.append(Paragraph(f"<b>Archivo B (Modificado):</b> {os.path.basename(path_b)}", style_normal))
|
| 256 |
-
story.append(Spacer(1, 24))
|
| 257 |
|
| 258 |
-
#
|
| 259 |
-
#
|
|
|
|
| 260 |
|
| 261 |
-
|
|
|
|
| 262 |
|
| 263 |
-
for
|
| 264 |
-
code =
|
| 265 |
-
|
| 266 |
|
| 267 |
-
# Escapar
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
if code == '- ':
|
| 273 |
-
# Eliminado (Rojo)
|
| 274 |
-
|
| 275 |
-
story.append(p)
|
| 276 |
-
story.append(Spacer(1, 4))
|
| 277 |
-
has_changes = True
|
| 278 |
elif code == '+ ':
|
| 279 |
-
# Añadido (Verde)
|
| 280 |
-
|
| 281 |
-
story.append(p)
|
| 282 |
-
story.append(Spacer(1, 4))
|
| 283 |
-
has_changes = True
|
| 284 |
elif code == ' ':
|
| 285 |
-
#
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 292 |
|
| 293 |
-
|
| 294 |
-
|
|
|
|
| 295 |
|
| 296 |
doc.build(story)
|
| 297 |
return output_path
|
|
@@ -325,13 +332,11 @@ class PDFEngine:
|
|
| 325 |
try:
|
| 326 |
imgs = convert_from_path(f, dpi=150)
|
| 327 |
prs = Presentation()
|
| 328 |
-
# Layout blanco
|
| 329 |
blank = 6
|
| 330 |
for i, img in enumerate(imgs):
|
| 331 |
ip = self._get_output_path(f"slide_{i}.jpg")
|
| 332 |
img.save(ip, "JPEG")
|
| 333 |
slide = prs.slides.add_slide(prs.slide_layouts[blank])
|
| 334 |
-
# Ajustar imagen al ancho de la slide
|
| 335 |
slide.shapes.add_picture(ip, Inches(0), Inches(0), width=prs.slide_width)
|
| 336 |
out = self._get_output_path(os.path.basename(f).replace(".pdf", ".pptx"))
|
| 337 |
prs.save(out)
|
|
|
|
| 1 |
+
# Versión 2.3: Core con Comparación Palabra por Palabra (Precisión Alta)
|
| 2 |
# Autor: Gemini (AI Assistant)
|
| 3 |
|
| 4 |
import os
|
|
|
|
| 20 |
from reportlab.lib import colors
|
| 21 |
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
|
| 22 |
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
|
| 23 |
+
from reportlab.lib.enums import TA_JUSTIFY
|
| 24 |
|
| 25 |
from config import TEMP_DIR
|
| 26 |
|
| 27 |
class PDFEngine:
|
| 28 |
|
| 29 |
+
# --- UTILIDADES INTERNAS ---
|
| 30 |
@staticmethod
|
| 31 |
def _get_output_path(filename: str) -> str:
|
| 32 |
unique_name = f"{uuid.uuid4().hex[:8]}_{filename}"
|
|
|
|
| 41 |
except: return {"pages": 0, "name": "Error", "title": ""}
|
| 42 |
|
| 43 |
def _parse_range_groups(self, range_str: str, max_pages: int) -> list:
|
|
|
|
| 44 |
groups = []
|
| 45 |
parts = range_str.split(',')
|
| 46 |
for part in parts:
|
|
|
|
| 85 |
except: return None
|
| 86 |
|
| 87 |
def get_preview_indices_from_string(self, range_str: str, max_pages: int) -> list:
|
|
|
|
| 88 |
key_pages = []
|
| 89 |
parts = range_str.split(',')
|
| 90 |
for part in parts:
|
|
|
|
| 149 |
|
| 150 |
def compress_pdf(self, file_path: str, power: int = 3) -> str:
|
| 151 |
if not file_path: raise ValueError("Falta archivo.")
|
|
|
|
| 152 |
q = {1: "/prepress", 3: "/ebook", 4: "/screen"}
|
| 153 |
gs_set = q.get(power, "/ebook")
|
| 154 |
out = self._get_output_path("comprimido.pdf")
|
|
|
|
| 208 |
return out
|
| 209 |
except Exception as e: raise RuntimeError(f"Error: {e}")
|
| 210 |
|
| 211 |
+
# --- COMPARACIÓN DE TEXTO PALABRA POR PALABRA (v2.3) ---
|
| 212 |
def compare_pdfs_text(self, path_a: str, path_b: str) -> str:
    """Build a PDF report with a word-by-word diff of the text of two PDFs.

    The text of every page is extracted with ``PdfReader`` and split on
    whitespace, so the comparison is made on words instead of lines: a
    single changed word does not mark its whole line as different.

    Deleted words are rendered in red with strike-through, added words in
    bold green, and unchanged words as plain text.

    Args:
        path_a: Path of the original PDF.
        path_b: Path of the modified PDF.

    Returns:
        Path of the generated report, as returned by ``_get_output_path``.

    Raises:
        ValueError: If either path is empty.
        RuntimeError: If reading a PDF or extracting its text fails.
    """
    if not path_a or not path_b:
        raise ValueError("Faltan archivos.")

    def escape_markup(text: str) -> str:
        # Paragraph text is parsed as XML-like markup, so raw '&', '<' and
        # '>' coming from the PDFs or from file names must become entities.
        # '&' goes first so the entities produced below are not re-escaped.
        return (
            text.replace('&', '&amp;')
            .replace('<', '&lt;')
            .replace('>', '&gt;')
        )

    def get_all_words(path: str) -> list:
        # str.split() with no argument drops line breaks and repeated
        # spaces, which is what makes the diff layout-independent.
        try:
            words = []
            for page in PdfReader(path).pages:
                extracted = page.extract_text()
                if extracted:
                    words.extend(extracted.split())
            return words
        except Exception as e:
            raise RuntimeError(f"Error leyendo PDF: {e}") from e

    words_a = get_all_words(path_a)
    words_b = get_all_words(path_b)

    output_path = self._get_output_path("informe_diferencias_palabras.pdf")
    doc = SimpleDocTemplate(output_path, pagesize=A4)
    styles = getSampleStyleSheet()

    # Body text style shared by the legend, the header and the diff itself.
    style_body = ParagraphStyle(
        'Body',
        parent=styles['BodyText'],
        alignment=TA_JUSTIFY,
        fontSize=11,
        leading=14,
    )

    story = []
    story.append(Paragraph("Informe de Comparación (Modo Palabras)", styles['Heading1']))
    story.append(Spacer(1, 12))

    legend = (
        '<b>Leyenda:</b> <font color="red"><strike>Eliminado</strike></font>'
        ' | <font color="green"><b>Añadido</b></font> | Texto Común'
    )
    story.append(Paragraph(legend, style_body))
    story.append(Spacer(1, 12))

    name_a = escape_markup(os.path.basename(path_a))
    name_b = escape_markup(os.path.basename(path_b))
    story.append(Paragraph(f"<b>A:</b> {name_a} | <b>B:</b> {name_b}", style_body))
    story.append(Spacer(1, 12))

    # The diff is emitted as a sequence of paragraphs of roughly this many
    # words each, instead of one huge paragraph (the original author notes
    # rendering problems with very large paragraphs).
    words_per_paragraph = 300
    fragments = []

    for token in difflib.ndiff(words_a, words_b):
        code, word = token[:2], token[2:]

        if code == '- ':
            # Deleted word: red + strike-through.
            template = '<font color="red"><strike>{}</strike></font> '
        elif code == '+ ':
            # Added word: green + bold.
            template = '<font color="green"><b>{}</b></font> '
        elif code == '  ':
            # Unchanged word. ndiff's code for common items is TWO spaces.
            template = '{} '
        else:
            # '? ' lines are ndiff's intraline hints, not words: skip them
            # so they neither render nor count toward the paragraph size.
            continue

        fragments.append(template.format(escape_markup(word)))

        # Only break on an unchanged word so a run of changes stays together.
        if len(fragments) > words_per_paragraph and code == '  ':
            story.append(Paragraph("".join(fragments), style_body))
            story.append(Spacer(1, 6))
            fragments = []

    # Whatever is left after the last flush.
    if fragments:
        story.append(Paragraph("".join(fragments), style_body))

    doc.build(story)
    return output_path
|
|
|
|
| 332 |
try:
|
| 333 |
imgs = convert_from_path(f, dpi=150)
|
| 334 |
prs = Presentation()
|
|
|
|
| 335 |
blank = 6
|
| 336 |
for i, img in enumerate(imgs):
|
| 337 |
ip = self._get_output_path(f"slide_{i}.jpg")
|
| 338 |
img.save(ip, "JPEG")
|
| 339 |
slide = prs.slides.add_slide(prs.slide_layouts[blank])
|
|
|
|
| 340 |
slide.shapes.add_picture(ip, Inches(0), Inches(0), width=prs.slide_width)
|
| 341 |
out = self._get_output_path(os.path.basename(f).replace(".pdf", ".pptx"))
|
| 342 |
prs.save(out)
|