File size: 1,495 Bytes
ebed33b
 
 
 
96a84a9
ebed33b
9ac3eaa
96a84a9
ebed33b
9ac3eaa
96a84a9
9ac3eaa
96a84a9
 
 
 
9ac3eaa
96a84a9
e450f6f
96a84a9
 
 
ebed33b
 
a2eb551
ebed33b
 
 
9ac3eaa
ebed33b
 
 
 
 
 
 
 
 
 
9ac3eaa
96a84a9
 
 
ebed33b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import fitz  # PyMuPDF
import pytesseract
from PIL import Image
import io
import requests


def extract_text_from_pdf(file_path_or_url, *, timeout=30):
    """Extract the text of a PDF, using OCR for pages that yield no text.

    Args:
        file_path_or_url: A local file path, or an ``http://`` / ``https://``
            URL from which the PDF is downloaded first.
        timeout: Seconds to wait for the HTTP server when downloading from a
            URL (passed to ``requests.get``). Ignored for local paths.

    Returns:
        The text of all pages concatenated in page order. For a page whose
        extracted text is empty or whitespace-only, the page is rendered to
        an image and the pytesseract OCR result is used instead.

    Raises:
        Exception: If the download responds with a status code other than 200.
    """
    if file_path_or_url.startswith(("http://", "https://")):
        # Without a timeout a stalled server would block the caller forever.
        response = requests.get(file_path_or_url, timeout=timeout)
        if response.status_code != 200:
            raise Exception(f"Failed to download the file: {response.status_code}")

        # Open the PDF straight from the downloaded bytes (no temp file).
        doc = fitz.open(stream=io.BytesIO(response.content), filetype="pdf")
    else:
        doc = fitz.open(file_path_or_url)

    page_texts = []
    try:
        for page in doc:
            page_text = page.get_text()

            if not page_text.strip():
                # No usable text on this page: render it and run OCR on the
                # rendered image instead.
                pix = page.get_pixmap()
                with Image.open(io.BytesIO(pix.tobytes("png"))) as img:
                    page_text = pytesseract.image_to_string(img)

            page_texts.append(page_text)
    finally:
        # Always release the document, even if extraction or OCR fails.
        doc.close()

    return "".join(page_texts)


# Example usage with Firebase URL
# firebase_url = "https://firebasestorage.googleapis.com/v0/b/<bucket>/o/<url-encoded-object-path>.pdf?alt=media&token=<access-token>"
# text = extract_text_from_pdf(firebase_url)
# print(text)