Spaces:
Runtime error
Runtime error
| import os | |
| from typing import List, Dict, Any | |
| import fitz # PyMuPDF | |
| import docx | |
| import requests | |
| import io | |
def process_text_file(file_url: str) -> List[Dict[str, Any]]:
    """Dispatch a remote document to the processor matching its file type.

    The type is read from the extension of the URL's *path* component, so a
    query string or fragment (for example Firebase Storage's
    ``?alt=media&token=...`` suffix) is never mistaken for part of the
    extension, even when it contains dots of its own. If the path carries no
    supported extension, the extension of the raw URL string (with any
    ``?...`` suffix dropped) is tried as a fallback.

    Args:
        file_url: URL (or plain path) of a ``.txt``, ``.pdf`` or ``.docx``
            file. Matching is case-insensitive.

    Returns:
        Whatever ``process_txt``, ``process_pdf`` or ``process_docx`` returns
        for ``file_url``.

    Raises:
        ValueError: If no supported extension can be found.
    """
    # Imported here so the function does not rely on a change to the
    # module-level import block.
    from urllib.parse import urlparse

    supported = ('.txt', '.pdf', '.docx')

    # Preferred source: the path only, without query string or fragment.
    path_ext = os.path.splitext(urlparse(file_url).path)[1].lower()
    # Fallback: extension of the raw string, for URLs whose file name only
    # appears after the path (e.g. ".../download?file=report.pdf").
    raw_ext = os.path.splitext(file_url)[1].lower().split("?")[0]

    if path_ext in supported:
        extension = path_ext
    elif raw_ext in supported:
        extension = raw_ext
    else:
        # Only used for the error message below.
        extension = path_ext or raw_ext

    # The unmodified URL is forwarded: its query string may carry the
    # access token needed for the download.
    if extension == '.txt':
        return process_txt(file_url)
    if extension == '.pdf':
        return process_pdf(file_url)
    if extension == '.docx':
        return process_docx(file_url)
    raise ValueError(f"Unsupported text file type: {extension}")
def process_txt(txt_url: str) -> List[Dict[str, Any]]:
    """Download a plain-text file and return its content as a single entry.

    Args:
        txt_url: URL of the text file to fetch.

    Returns:
        A one-element list holding a dict with the keys ``file_name``
        (``os.path.basename`` of the URL), ``text`` (the decoded body) and
        ``page_number`` (always 1). An empty list is returned, after printing
        a message, when the response status is not 200.
    """
    # NOTE(review): no timeout is passed to requests.get, so a server that
    # stops responding blocks this call indefinitely.
    response = requests.get(txt_url)

    if response.status_code != 200:
        print(f"Failed to fetch the TXT file. Status code: {response.status_code}")
        return []

    content_type = response.headers.get("Content-Type", "")
    if "charset" in content_type.lower():
        # The server declared an encoding; let requests apply it.
        content = response.text
    else:
        # With no declared charset, requests decodes text/* bodies as
        # ISO-8859-1, which garbles UTF-8 files. Try strict UTF-8 first and
        # keep the previous behaviour only when the bytes are not UTF-8.
        try:
            content = response.content.decode("utf-8")
        except UnicodeDecodeError:
            content = response.text

    return [{
        "file_name": os.path.basename(txt_url),
        "text": content,
        "page_number": 1
    }]
def process_pdf(pdf_url: str) -> List[Dict[str, Any]]:
    """Download a PDF and return the text of all its pages as a single entry.

    Args:
        pdf_url: URL of the PDF file to fetch.

    Returns:
        A one-element list holding a dict with the keys ``file_name``
        (``os.path.basename`` of the URL), ``text`` (the plain text of every
        page, concatenated in page order) and ``page_number``. Because all
        pages are merged into one entry, ``page_number`` is 1, matching the
        entries produced by the TXT and DOCX processors. An empty list is
        returned, after printing a message, when the response status is
        not 200.
    """
    # NOTE(review): no timeout is passed to requests.get, so a server that
    # stops responding blocks this call indefinitely.
    response = requests.get(pdf_url)

    if response.status_code != 200:
        print(f"Failed to fetch the PDF file. Status code: {response.status_code}")
        return []

    # PyMuPDF opens the document from an in-memory stream; no temp file.
    pdf_stream = io.BytesIO(response.content)

    # The context manager closes the document even if text extraction fails.
    with fitz.open(stream=pdf_stream, filetype="pdf") as pdf_document:
        # join() avoids re-copying the accumulated text for every page.
        pdf_text = "".join(page.get_text("text") for page in pdf_document)

    return [{
        "file_name": os.path.basename(pdf_url),
        "text": pdf_text,
        "page_number": 1
    }]
| def process_docx(docx_url: str) -> List[Dict[str, Any]]: | |
| # Fetch the DOCX file content from the URL | |
| response = requests.get(docx_url) | |
| # Check if the request was successful | |
| if response.status_code == 200: | |
| # Load the DOCX file from the response content | |
| docx_stream = io.BytesIO(response.content) | |
| # Open the DOCX file with python-docx | |
| doc = docx.Document(docx_stream) | |
| # Extract text from the DOCX file | |
| content = "\n".join([para.text for para in doc.paragraphs]) | |
| return [{ | |
| "file_name": os.path.basename(docx_url), | |
| "text": content, | |
| "page_number": 1 # DOCX doesn't have pages, so just 1 | |
| }] | |
| else: | |
| print(f"Failed to fetch the DOCX file. Status code: {response.status_code}") | |