Spaces:
Runtime error
Runtime error
| import re | |
| from PyPDF2 import PdfReader | |
def preprocess_text(text):
    """Clean raw text (e.g. text extracted from a PDF) for later processing.

    The steps run in a fixed order because later steps depend on earlier
    ones:

    1. Delete newlines and tabs (deleted outright, not replaced by a space).
    2. Delete isolated single capital letters surrounded by whitespace.
    3. Delete e-mail-like tokens (any non-space run containing ``@``).
    4. Delete dates written as ``DD-MM-YYYY`` or ``DD/MM/YYYY``.
    5. Delete phone numbers of the form ``+CC`` followed by digit groups.
    6. Delete ``Issued <word> <year>Credential ID <word>`` fragments.
    7. Collapse every run of whitespace into a single space.
    8. Insert a space at each lowercase-to-uppercase boundary, which
       re-separates words that step 1 glued together (``fooBar`` ->
       ``foo Bar``).

    Leading/trailing whitespace is not stripped.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # Remove newlines and tabs
    text = re.sub(r'\n|\t', '', text)
    # Remove isolated single capital letters. The trailing whitespace is
    # only looked at (lookahead), not consumed, so it can serve as the
    # leading whitespace of the next match; otherwise every second letter
    # in a run such as " A B C " would be left behind.
    text = re.sub(r'\s[A-Z](?=\s)', '', text)
    # Remove emails
    text = re.sub(r'\S+@\S+', '', text)
    # Remove dates in the format DD-MM-YYYY or DD/MM/YYYY
    text = re.sub(r'\d{2}[-/]\d{2}[-/]\d{4}', '', text)
    # Remove phone numbers
    text = re.sub(r'\+\d{2}\s?\d{2,3}\s?\d{3,4}\s?\d{4}', '', text)
    # Remove specific text format
    text = re.sub(r'Issued\s\w+\s\d{4}Credential ID \w+', '', text)
    # Collapse runs of whitespace into a single space
    text = re.sub(r'\s+', ' ', text)
    # Add a space where a lowercase letter is directly followed by a capital
    text = re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', text)
    return text
def _read_pdf_text(pdf):
    """Return the text of every page of *pdf*, concatenated in page order."""
    reader = PdfReader(pdf)
    return "".join(page.extract_text() for page in reader.pages)


def _pdf_filename(pdf):
    """Return ``pdf.name`` if *pdf* has one, otherwise ``str(pdf)``."""
    try:
        return pdf.name
    except AttributeError:
        # Plain path strings are valid PdfReader inputs but have no ``.name``.
        return str(pdf)


def get_pdf_text(pdfs, preprocess=True):
    """Extract the text of one or more PDFs.

    Args:
        pdfs: Iterable of PDF sources, each passed to ``PdfReader`` as-is
            (e.g. uploaded file objects exposing a ``name`` attribute).
        preprocess: Selects the processing mode and the return shape.

    Returns:
        If ``preprocess`` is truthy: a list with one dict per PDF, in input
        order, of the form ``{"filename": ..., "text": ...}``, where
        ``text`` is the PDF's full text after ``preprocess_text`` and
        ``filename`` is the source's ``name`` attribute (or ``str(pdf)``
        when it has none).

        Otherwise: a single string holding the raw text of all PDFs
        concatenated in input order, with no separator and no cleaning.
    """
    if not preprocess:
        return "".join(_read_pdf_text(pdf) for pdf in pdfs)

    return [
        {
            "filename": _pdf_filename(pdf),
            "text": preprocess_text(_read_pdf_text(pdf)),
        }
        for pdf in pdfs
    ]