Spaces:
Sleeping
Sleeping
| import fitz # PyMuPDF | |
| import pytesseract | |
| from PIL import Image | |
| import io | |
| import requests | |
def extract_text_from_pdf(file_path_or_url, *, timeout=30, ocr_dpi=300):
    """Extract the text of a PDF, using OCR for pages that have no text layer.

    Args:
        file_path_or_url: Either a local filesystem path to a PDF, or an
            ``http://`` / ``https://`` URL from which the PDF is downloaded.
        timeout: Seconds to wait for the HTTP download before giving up.
            Only used when ``file_path_or_url`` is a URL.
        ocr_dpi: Resolution (dots per inch) at which a page is rasterized
            before being handed to Tesseract. Only used for pages whose
            embedded text is empty or whitespace-only.

    Returns:
        The text of every page, concatenated in page order.

    Raises:
        RuntimeError: The download answered with a status code other than 200.
        Errors raised by ``requests`` (e.g. on timeout), PyMuPDF, or
        ``pytesseract`` are propagated unchanged.
    """
    if file_path_or_url.startswith(("http://", "https://")):
        # A timeout is mandatory here: without one, a stalled server would
        # block the caller forever.
        response = requests.get(file_path_or_url, timeout=timeout)
        if response.status_code != 200:
            raise RuntimeError(
                f"Failed to download the file: {response.status_code}"
            )
        # Open the PDF straight from the downloaded bytes (no temp file).
        doc = fitz.open(stream=response.content, filetype="pdf")
    else:
        doc = fitz.open(file_path_or_url)

    # PDF user space is 72 units per inch, so this factor scales the
    # rendered bitmap to the requested OCR resolution.
    zoom = ocr_dpi / 72
    render_matrix = fitz.Matrix(zoom, zoom)

    page_texts = []
    try:
        for page in doc:
            page_text = page.get_text()
            if not page_text.strip():
                # No usable text layer (e.g. a scanned page): rasterize the
                # page and let Tesseract recognize it instead.
                pix = page.get_pixmap(matrix=render_matrix)
                with Image.open(io.BytesIO(pix.tobytes("png"))) as img:
                    page_text = pytesseract.image_to_string(img)
            page_texts.append(page_text)
    finally:
        # Release the document even if extraction or OCR fails part-way.
        doc.close()

    return "".join(page_texts)
| # Example usage with Firebase URL | |
| # firebase_url = "https://firebasestorage.googleapis.com/v0/b/resumeats-50ccf.firebasestorage.app/o/uploads%2Fsanthoshrajan776%40gmail.com%2FSanthoshNatarajan_InternshalaResume%20(1).pdf?alt=media&token=f11f9601-6550-4e64-bba6-a2b699a148af" | |
| # text = extract_text_from_pdf(firebase_url) | |
| # print(text) | |