Spaces:
Sleeping
Sleeping
| from src.utils.tooling import tool | |
| import PyPDF2 | |
| import re | |
def analyze_document(file_path: str, keywords: list) -> str:
    """
    Extracts specific information from a local PDF or local text document based on given keywords.
    (WARNING: This tool does not support URLs or web pages as input.)

    The file type is chosen from the file extension (case-insensitive):
    ``.pdf`` files are read with PyPDF2, ``.txt`` files are read as UTF-8.
    Each keyword is then searched case-insensitively as a whole token.

    Args:
        file_path (str): The path to the PDF or text document to analyze.
        keywords (list): A list of keyword strings to search for in the
            document. Empty strings and ``None`` entries are ignored.
    Returns:
        str: One line per keyword that matched at least once, formatted as
            ``Keyword '<keyword>': <match>, <match>, ...`` and joined with
            newlines. An empty string when no keyword matched.
    Raises:
        ValueError: If the file extension is neither ``.pdf`` nor ``.txt``.
        Exception: If the file cannot be opened, decoded or parsed; the
            original error is attached as ``__cause__``.
    """

    def extract_text_from_pdf(file_path: str) -> str:
        """
        Extracts text from a PDF file.
        Args:
            file_path (str): The path to the PDF file.
        Returns:
            str: The text of all pages, concatenated in page order.
        """
        try:
            # PdfFileReader / numPages / getPage were removed in PyPDF2 3.0.
            # Prefer the current PdfReader class and only fall back to the
            # legacy name on installs that predate it.
            reader_cls = getattr(PyPDF2, 'PdfReader', None) or PyPDF2.PdfFileReader
            with open(file_path, 'rb') as file:
                reader = reader_cls(file)
                # Guard against a page yielding None (e.g. image-only pages)
                # so the join cannot fail on a non-string.
                return ''.join(page.extract_text() or '' for page in reader.pages)
        except Exception as e:
            raise Exception(f"Error reading PDF file: {e}") from e

    def extract_text_from_txt(file_path: str) -> str:
        """
        Extracts text from a text file.
        Args:
            file_path (str): The path to the text file.
        Returns:
            str: The full file content decoded as UTF-8.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                return file.read()
        except Exception as e:
            raise Exception(f"Error reading text file: {e}") from e

    def extract_information(text: str, keywords: list) -> str:
        """
        Extracts information based on keywords from the text.
        Args:
            text (str): The text to analyze.
            keywords (list): A list of keywords to search for in the text.
        Returns:
            str: One ``Keyword '...': ...`` line per keyword with matches.
        """
        extracted_info = []
        for keyword in keywords:
            # An empty keyword would compile to r'\b\b' and match at every
            # word boundary, producing a line of empty matches.
            if keyword is None or keyword == '':
                continue
            # r'\b' only makes sense next to a word character. For keywords
            # that start or end with punctuation (e.g. "C++"), a boundary on
            # that side would prevent standalone occurrences from matching.
            leading = r'\b' if re.match(r'\w', keyword[0]) else ''
            trailing = r'\b' if re.match(r'\w', keyword[-1]) else ''
            pattern = re.compile(leading + re.escape(keyword) + trailing, re.IGNORECASE)
            matches = pattern.findall(text)
            if matches:
                extracted_info.append(f"Keyword '{keyword}': {', '.join(matches)}")
        return "\n".join(extracted_info)

    extension_source = file_path.lower()
    if extension_source.endswith('.pdf'):
        text = extract_text_from_pdf(file_path)
    elif extension_source.endswith('.txt'):
        text = extract_text_from_txt(file_path)
    else:
        raise ValueError("Unsupported file format. Please provide a PDF or text file.")
    return extract_information(text, keywords)