Resume-ATS / Process /extract.py
HARISH20205's picture
skills missing elements
a2eb551
import fitz # PyMuPDF
import pytesseract
from PIL import Image
import io
import requests
def extract_text_from_pdf(file_path_or_url, timeout=30):
    """Extract the text of a PDF given a local path or an HTTP(S) URL.

    Each page is first read through PyMuPDF's text layer. A page whose text
    layer is empty or whitespace-only is rendered to a PNG image and passed
    through Tesseract OCR instead.

    Args:
        file_path_or_url: Local filesystem path of the PDF, or a URL starting
            with ``http://`` or ``https://`` from which it is downloaded.
        timeout: Seconds to wait for the HTTP server before giving up. Only
            used when ``file_path_or_url`` is a URL.

    Returns:
        The text of all pages concatenated in page order.

    Raises:
        Exception: If the download responds with a status code other than 200.
    """
    if file_path_or_url.startswith(("http://", "https://")):
        # Without a timeout, requests waits forever on an unresponsive server.
        response = requests.get(file_path_or_url, timeout=timeout)
        if response.status_code != 200:
            raise Exception(f"Failed to download the file: {response.status_code}")
        # Open the PDF straight from the downloaded bytes (no temp file).
        doc = fitz.open(stream=io.BytesIO(response.content), filetype="pdf")
    else:
        doc = fitz.open(file_path_or_url)

    page_texts = []
    try:
        for page in doc:
            page_text = page.get_text()
            if page_text.strip():
                page_texts.append(page_text)
                continue

            # No usable text layer: rasterize the page and fall back to OCR.
            # NOTE(review): the pixmap is rendered at the library's default
            # resolution; a higher zoom may improve OCR accuracy - confirm.
            pix = page.get_pixmap()
            with Image.open(io.BytesIO(pix.tobytes("png"))) as img:
                page_texts.append(pytesseract.image_to_string(img))
    finally:
        # Release the document even if text extraction or OCR fails.
        doc.close()

    return "".join(page_texts)
# Example usage with Firebase URL
# firebase_url = "https://firebasestorage.googleapis.com/v0/b/resumeats-50ccf.firebasestorage.app/o/uploads%2Fsanthoshrajan776%40gmail.com%2FSanthoshNatarajan_InternshalaResume%20(1).pdf?alt=media&token=f11f9601-6550-4e64-bba6-a2b699a148af"
# text = extract_text_from_pdf(firebase_url)
# print(text)