Spaces:
Sleeping
Sleeping
| import PyPDF2 | |
| import tiktoken | |
| # Ensure NLTK resources are available | |
def extract_text_from_pdf(file):
    """
    Extracts text from a PDF file and tracks text by page.

    :param file: Uploaded PDF file object; passed directly to ``PyPDF2.PdfReader``.
    :return: Tuple (text, page_texts), where:
        - text is the combined text of the entire PDF (page texts
          concatenated in page order, with no separator added).
        - page_texts is a list of tuples [(page_number, page_text), ...],
          with page numbers starting at 1.
    """
    pdf_reader = PyPDF2.PdfReader(file)
    page_texts = []
    for page_number, page in enumerate(pdf_reader.pages, start=1):
        # Defensive: if extraction yields no string for a page, record an
        # empty string so building the combined text cannot fail on it.
        page_content = page.extract_text() or ""
        page_texts.append((page_number, page_content))
    # Build the combined text in one pass instead of growing a string
    # with += on every page.
    text = "".join(page_content for _, page_content in page_texts)
    return text, page_texts
def count_tokens(string: str, encoding_name: str = "o200k_base") -> int:
    """
    Returns the number of tokens in a text string.

    :param string: Text to tokenize.
    :param encoding_name: Name of the tiktoken encoding to use.
        Defaults to "o200k_base".
    :return: Number of tokens the encoding produces for ``string``.
    """
    encoding = tiktoken.get_encoding(encoding_name)
    # With its default arguments, encode() raises ValueError when the text
    # contains a special-token string (e.g. "<|endoftext|>"). Passing
    # disallowed_special=() makes such substrings be encoded as ordinary
    # text, so counting tokens of arbitrary input does not fail.
    return len(encoding.encode(string, disallowed_special=()))