Spaces:
Runtime error
Runtime error
| import re | |
| import requests | |
| from markdownify import markdownify | |
| from requests.exceptions import RequestException | |
| import gradio as gr | |
| # Import the Preprocessor class | |
| from utils.preprocessor import Preprocessor | |
def _truncate_middle(text, max_length):
    """Return the head and tail of *text* joined by a truncation notice.

    Roughly half of the ``max_length`` budget is taken from the start of
    *text* and the remainder from its end. The notice itself is not counted
    against the budget, so the result is longer than ``max_length``.
    A zero or negative ``max_length`` keeps no content, only the notice.
    """
    budget = max(max_length, 0)
    head_len = budget // 2
    tail_len = budget - head_len
    # text[-0:] is the whole string, so an empty tail has to be spelled out
    # instead of relying on a negative slice.
    tail = text[-tail_len:] if tail_len else ""
    return (
        text[:head_len]
        + f"\n..._This content has been truncated to stay below {max_length} characters_...\n"
        + tail
    )


def visit_webpage(url, max_output_length=40000):
    """Fetch a webpage, convert it to markdown, and report extracted paths.

    The page is downloaded with a 20 second timeout, converted to markdown,
    runs of three or more newlines are collapsed to a single blank line, and
    content longer than ``max_output_length`` is cut down to its head and
    tail. The markdown is then passed through
    ``Preprocessor.extract_section`` and
    ``Preprocessor.extract_dirs_from_text``; what those helpers select is
    defined in ``utils.preprocessor`` and is not visible from this function.

    Args:
        url: Address of the page to fetch. Surrounding whitespace is ignored.
        max_output_length: Maximum number of markdown characters kept before
            the truncation notice is inserted.

    Returns:
        A string of the form ``"paths: ...\\n\\nfiles: ..."`` on success, or a
        human-readable error message. Errors are returned rather than raised
        because this function is used directly as the UI callback.
    """
    try:
        # URLs pasted into a textbox often carry stray whitespace or a
        # trailing newline; strip it so it does not end up in the request.
        if isinstance(url, str):
            url = url.strip()

        response = requests.get(url, timeout=20)
        response.raise_for_status()

        markdown_content = markdownify(response.text).strip()
        markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)

        if len(markdown_content) > max_output_length:
            markdown_content = _truncate_middle(markdown_content, max_output_length)

        # Use Preprocessor class methods
        section = Preprocessor.extract_section(markdown_content)
        dir_paths, files = Preprocessor.extract_dirs_from_text(section)

        # Format the result
        return f"paths: {dir_paths}\n\nfiles: {files}"
    except requests.exceptions.Timeout:
        # Timeout is a RequestException subclass, so it must be handled first
        # to get its dedicated message.
        return "The request timed out. Please try again later or check the URL."
    except RequestException as e:
        return f"Error fetching the webpage: {str(e)}"
    except Exception as e:
        # Last-resort boundary: the UI should show a message, not a traceback.
        return f"An unexpected error occurred: {str(e)}"
# Build the UI: one textbox for the URL in, one textbox for the result out.
url_input = gr.Textbox(label="Website URL")
# NOTE(review): the label mentions an "Extracted Section", but visit_webpage
# only returns the paths and files lines - confirm which one is intended.
result_output = gr.Textbox(
    label="Extracted Section, Directory Paths, and File Paths"
)

demo = gr.Interface(
    fn=visit_webpage,
    inputs=url_input,
    outputs=result_output,
    title="Webpage Section and Path Extractor",
)

# Start the Gradio server as soon as the module is executed.
demo.launch()