File size: 1,789 Bytes
ec3d86e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# pragent/backend/text_pipeline.py

import asyncio
import sys
import os
from pathlib import Path
import aiofiles.os 
from tqdm.asyncio import tqdm
from pragent.backend.pdf2html import convert_pdf_to_text_only_html
from pragent.backend.html2txt import convert_html_to_txt

# MODIFIED FOR ABLATION STUDY: Added ablation_mode parameter
async def pipeline(pdf_path: str, output_txt_path: str, ablation_mode: str = "none") -> None:
    """
    Run the complete ASYNCHRONOUS conversion flow from PDF to TXT.

    The PDF is first converted to an intermediate HTML file placed next to the
    PDF (same name, with the suffix replaced by ".temp.html"), that HTML is then
    converted to the final text file, and the intermediate file is removed.

    Args:
        pdf_path: Path of the source PDF file.
        output_txt_path: Path where the final text file is written.
        ablation_mode: Accepted for interface compatibility but not used in
            this function; the primary logic for summarization ablation is
            handled downstream in blog_pipeline.py.

    Returns:
        None. Progress and failures are reported through ``tqdm.write``
        (failures go to stderr). A converter returning a falsy value is
        reported and does not raise.
    """
    tqdm.write("--- PDF to TXT Conversion Pipeline Started ---")

    pdf_file = Path(pdf_path)
    intermediate_html_path = pdf_file.with_suffix(".temp.html")

    # The conversion steps live inside try/finally so the intermediate HTML
    # file is removed on every exit path: a failed Step 1 (which may leave a
    # partially written file behind), a failed Step 2, an exception raised by
    # either converter, or task cancellation.
    try:
        tqdm.write("\n--- Step 1/3: Converting PDF to HTML ---")
        if not await convert_pdf_to_text_only_html(pdf_path, str(intermediate_html_path)):
            tqdm.write("[!] PDF to HTML conversion failed. Aborting pipeline.", file=sys.stderr)
            return

        tqdm.write("\n--- Step 2/3: Converting HTML to TXT ---")
        if not await convert_html_to_txt(str(intermediate_html_path), output_txt_path):
            tqdm.write("[!] HTML to TXT conversion failed. Aborting pipeline.", file=sys.stderr)
        else:
            tqdm.write(f"\n[✓] Success! Final text file saved to: {output_txt_path}")
    finally:
        tqdm.write("\n--- Step 3/3: Cleaning up temporary files ---")
        try:
            await aiofiles.os.remove(intermediate_html_path)
            tqdm.write(f"[*] Temporary file '{intermediate_html_path.name}' deleted successfully.")
        except FileNotFoundError:
            # Nothing to delete: the converter failed before creating the
            # intermediate file. This is expected on the failure path and is
            # not worth reporting as an error.
            pass
        except OSError as e:
            # Best-effort cleanup: report the problem but do not mask the
            # outcome (or exception) of the conversion itself.
            tqdm.write(f"[!] Error deleting temporary file: {e}", file=sys.stderr)

    tqdm.write("\n--- Pipeline Finished ---")