PDFConverter-ENG / tools /python /pdf_to_text.py
innoai's picture
Deploy PDF Converter - compiled .NET 8 app
80a3675 verified
Raw
History Blame Contribute Delete
1.25 kB
#!/usr/bin/env python3
"""PDF to Text extractor using PyPDF2."""
import argparse, json, sys
from pathlib import Path
def convert(input_path, output_path=None):
    """Extract the text of a PDF and write it to a UTF-8 text file.

    Every page whose ``extract_text()`` result is non-empty is written as a
    ``--- Page N ---`` header line followed by that page's text, and the
    pages are separated by one blank line. Pages that yield no text are
    skipped, so the page numbers in the output may have gaps.

    Args:
        input_path: Path of the PDF file to read.
        output_path: Path of the text file to write. When falsy (``None``
            or ``""``), the input path with its suffix replaced by ``.txt``
            is used instead.

    Returns:
        The path that was written: ``output_path`` unchanged when it was
        given, otherwise the derived ``.txt`` path as a string.

    Raises:
        ImportError: If PyPDF2 cannot be imported.
        Exception: Anything raised by PyPDF2 while reading the PDF or by the
            file system while writing the output is propagated unchanged.
    """
    # Imported lazily so that a missing PyPDF2 surfaces as an exception from
    # this call (where a caller can report it) rather than at module import.
    from PyPDF2 import PdfReader

    reader = PdfReader(input_path)
    sections = []
    for page_number, page in enumerate(reader.pages, start=1):
        text = page.extract_text()
        if text:
            sections.append(f"--- Page {page_number} ---\n{text}")

    if not output_path:
        output_path = str(Path(input_path).with_suffix('.txt'))

    # Create missing parent directories so a fresh output folder does not
    # make the write fail with FileNotFoundError.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write('\n\n'.join(sections))
    return output_path
def main():
    """Command-line entry point: convert one PDF and report the result as JSON.

    Parses ``--input`` (required) and ``--output`` (optional), runs
    ``convert`` and prints exactly one JSON object to stdout with the keys
    ``success``, ``output`` and ``message``. On any exception from the
    conversion the object has ``success`` false and an empty ``output``, and
    the process exits with status 1.
    """
    parser = argparse.ArgumentParser(description='Extract text from PDF')
    parser.add_argument('--input', required=True)
    # Optional: when omitted, convert() falls back to the input path with a
    # ``.txt`` suffix.
    parser.add_argument('--output', default=None)
    args = parser.parse_args()
    try:
        result = convert(args.input, args.output)
        print(json.dumps({"success": True, "output": result, "message": "Text extracted successfully"}))
    except Exception as e:
        # Top-level boundary: every failure is turned into a JSON report.
        # Some exceptions stringify to "", so fall back to the class name to
        # keep the message informative.
        message = str(e) or type(e).__name__
        print(json.dumps({"success": False, "output": "", "message": message}))
        sys.exit(1)
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()