Resume_analysis_RAG / create_pdfs.py
DEEPAN-C's picture
Upload folder using huggingface_hub
e4f4981 verified
Raw
History Blame Contribute Delete
1.72 kB
import logging
import os

from fpdf import FPDF
# Typographic characters that have no Latin-1 code point, mapped to close
# ASCII stand-ins so the core PDF font does not render them as "?".
_LATIN1_FALLBACKS = str.maketrans({
    '\u2022': '*',    # bullet
    '\u2013': '-',    # en dash
    '\u2014': '-',    # em dash
    '\u2018': "'",    # left single quotation mark
    '\u2019': "'",    # right single quotation mark
    '\u201c': '"',    # left double quotation mark
    '\u201d': '"',    # right double quotation mark
    '\u2026': '...',  # horizontal ellipsis
})


def create_pdf_from_txt(txt_path, pdf_path):
    """Render a text file as a PDF, one source line per PDF row.

    The text is read as UTF-8, common typographic characters are swapped
    for ASCII equivalents, and anything else that cannot be represented in
    Latin-1 is replaced with "?" before being written with the Arial core
    font at 12pt.

    Args:
        txt_path: Path of the text file to read.
        pdf_path: Path of the PDF file to write (overwritten if present).

    Raises:
        OSError: If ``txt_path`` cannot be opened.
        UnicodeDecodeError: If ``txt_path`` is not valid UTF-8.
    """
    # Decode explicitly instead of relying on the platform locale, which
    # would mis-read the bullet/dash characters on non-UTF-8 systems.
    # 'utf-8-sig' also drops a leading byte-order mark when one is present.
    with open(txt_path, 'r', encoding='utf-8-sig') as file:
        content = file.read()

    # One pass over the text instead of a chain of str.replace() calls.
    content = content.translate(_LATIN1_FALLBACKS)

    pdf = FPDF()
    pdf.add_page()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.set_font("Arial", size=12)

    for line_number, line in enumerate(content.split('\n'), start=1):
        # errors='replace' never raises: unencodable characters become "?".
        safe_line = line.encode('latin-1', 'replace').decode('latin-1')
        try:
            pdf.cell(0, 10, txt=safe_line, ln=True)
        except Exception:
            # Stay best-effort (one bad line must not abort the document),
            # but record what was dropped instead of skipping it silently.
            logging.getLogger(__name__).warning(
                "Skipping line %d of %s: could not be rendered",
                line_number, txt_path, exc_info=True,
            )

    pdf.output(pdf_path)
def _convert_txt_files_in(directory):
    """Convert each ``.txt`` file in *directory* to a PDF, then delete it.

    A directory that does not exist is reported and skipped. A text file is
    removed only after its PDF has been written without raising.
    """
    if not os.path.isdir(directory):
        print("Skipping missing directory:", directory)
        return

    suffix = '.txt'
    # Sorted so the processing order is the same on every run/platform.
    for entry in sorted(os.listdir(directory)):
        if not entry.endswith(suffix):
            continue
        txt_path = os.path.join(directory, entry)
        # Swap only the trailing extension; str.replace() would also rewrite
        # any earlier ".txt" that happens to appear inside the file name.
        pdf_name = entry[:-len(suffix)] + '.pdf'
        pdf_path = os.path.join(directory, pdf_name)
        create_pdf_from_txt(txt_path, pdf_path)
        os.remove(txt_path)  # Remove the txt file after creating PDF


def main():
    """Convert the job-description and resume text files to PDFs in place.

    Processes the ``JOB_DESCRIPTIONS`` and ``DATA_resume`` directories,
    resolved relative to the current working directory.
    """
    for directory in ("JOB_DESCRIPTIONS", "DATA_resume"):
        _convert_txt_files_in(directory)
# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()