Update app.py
Browse files
app.py
CHANGED
|
@@ -17,9 +17,15 @@ source_url = ""
|
|
| 17 |
def fetch_documentation(url: str) -> str:
|
| 18 |
try:
|
| 19 |
headers = {
|
| 20 |
-
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
}
|
| 22 |
-
response = requests.get(url, headers=headers, timeout=
|
| 23 |
response.raise_for_status()
|
| 24 |
|
| 25 |
soup = BeautifulSoup(response.content, 'html.parser')
|
|
@@ -35,7 +41,15 @@ def fetch_documentation(url: str) -> str:
|
|
| 35 |
|
| 36 |
return text
|
| 37 |
except Exception as e:
|
| 38 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
|
| 40 |
def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> List[str]:
|
| 41 |
sentences = re.split(r'[.!?]+', text)
|
|
@@ -186,9 +200,10 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Documentation RAG System") as demo
|
|
| 186 |
gr.Markdown("### Example URLs to try:")
|
| 187 |
gr.Examples(
|
| 188 |
examples=[
|
|
|
|
| 189 |
["https://docs.python.org/3/tutorial/introduction.html"],
|
|
|
|
| 190 |
["https://pytorch.org/docs/stable/torch.html"],
|
| 191 |
-
["https://huggingface.co/docs/transformers/quicktour"],
|
| 192 |
],
|
| 193 |
inputs=url_input
|
| 194 |
)
|
|
|
|
| 17 |
def fetch_documentation(url: str) -> str:
|
| 18 |
try:
|
| 19 |
headers = {
|
| 20 |
+
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
| 21 |
+
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
| 22 |
+
'Accept-Language': 'en-US,en;q=0.5',
|
| 23 |
+
'Accept-Encoding': 'gzip, deflate, br',
|
| 24 |
+
'DNT': '1',
|
| 25 |
+
'Connection': 'keep-alive',
|
| 26 |
+
'Upgrade-Insecure-Requests': '1'
|
| 27 |
}
|
| 28 |
+
response = requests.get(url, headers=headers, timeout=15, allow_redirects=True)
|
| 29 |
response.raise_for_status()
|
| 30 |
|
| 31 |
soup = BeautifulSoup(response.content, 'html.parser')
|
|
|
|
| 41 |
|
| 42 |
return text
|
| 43 |
except Exception as e:
|
| 44 |
+
error_msg = str(e)
|
| 45 |
+
if "403" in error_msg or "Forbidden" in error_msg:
|
| 46 |
+
raise Exception(f"Access denied (403 Forbidden). This website blocks automated requests. Try: 1) Using the site's API if available, 2) A different documentation page, 3) GitHub raw content URLs work well (e.g., https://raw.githubusercontent.com/...)")
|
| 47 |
+
elif "404" in error_msg:
|
| 48 |
+
raise Exception(f"Page not found (404). Please check the URL is correct.")
|
| 49 |
+
elif "timeout" in error_msg.lower():
|
| 50 |
+
raise Exception(f"Request timeout. The website took too long to respond.")
|
| 51 |
+
else:
|
| 52 |
+
raise Exception(f"Error fetching URL: {error_msg}")
|
| 53 |
|
| 54 |
def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> List[str]:
|
| 55 |
sentences = re.split(r'[.!?]+', text)
|
|
|
|
| 200 |
gr.Markdown("### Example URLs to try:")
|
| 201 |
gr.Examples(
|
| 202 |
examples=[
|
| 203 |
+
["https://raw.githubusercontent.com/python/cpython/main/README.rst"],
|
| 204 |
["https://docs.python.org/3/tutorial/introduction.html"],
|
| 205 |
+
["https://raw.githubusercontent.com/huggingface/transformers/main/README.md"],
|
| 206 |
["https://pytorch.org/docs/stable/torch.html"],
|
|
|
|
| 207 |
],
|
| 208 |
inputs=url_input
|
| 209 |
)
|