Rahaf2001 committed on
Commit
cf2feea
·
verified ·
1 Parent(s): 2fc5605

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -4
app.py CHANGED
@@ -17,9 +17,15 @@ source_url = ""
17
  def fetch_documentation(url: str) -> str:
18
  try:
19
  headers = {
20
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
 
 
 
 
 
 
21
  }
22
- response = requests.get(url, headers=headers, timeout=10)
23
  response.raise_for_status()
24
 
25
  soup = BeautifulSoup(response.content, 'html.parser')
@@ -35,7 +41,15 @@ def fetch_documentation(url: str) -> str:
35
 
36
  return text
37
  except Exception as e:
38
- raise Exception(f"Error fetching URL: {str(e)}")
 
 
 
 
 
 
 
 
39
 
40
  def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> List[str]:
41
  sentences = re.split(r'[.!?]+', text)
@@ -186,9 +200,10 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Documentation RAG System") as demo
186
  gr.Markdown("### Example URLs to try:")
187
  gr.Examples(
188
  examples=[
 
189
  ["https://docs.python.org/3/tutorial/introduction.html"],
 
190
  ["https://pytorch.org/docs/stable/torch.html"],
191
- ["https://huggingface.co/docs/transformers/quicktour"],
192
  ],
193
  inputs=url_input
194
  )
 
17
  def fetch_documentation(url: str) -> str:
18
  try:
19
  headers = {
20
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
21
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
22
+ 'Accept-Language': 'en-US,en;q=0.5',
23
+ 'Accept-Encoding': 'gzip, deflate, br',
24
+ 'DNT': '1',
25
+ 'Connection': 'keep-alive',
26
+ 'Upgrade-Insecure-Requests': '1'
27
  }
28
+ response = requests.get(url, headers=headers, timeout=15, allow_redirects=True)
29
  response.raise_for_status()
30
 
31
  soup = BeautifulSoup(response.content, 'html.parser')
 
41
 
42
  return text
43
  except Exception as e:
44
+ error_msg = str(e)
45
+ if "403" in error_msg or "Forbidden" in error_msg:
46
+ raise Exception(f"Access denied (403 Forbidden). This website blocks automated requests. Try: 1) Using the site's API if available, 2) A different documentation page, 3) GitHub raw content URLs work well (e.g., https://raw.githubusercontent.com/...)")
47
+ elif "404" in error_msg:
48
+ raise Exception(f"Page not found (404). Please check the URL is correct.")
49
+ elif "timeout" in error_msg.lower():
50
+ raise Exception(f"Request timeout. The website took too long to respond.")
51
+ else:
52
+ raise Exception(f"Error fetching URL: {error_msg}")
53
 
54
  def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> List[str]:
55
  sentences = re.split(r'[.!?]+', text)
 
200
  gr.Markdown("### Example URLs to try:")
201
  gr.Examples(
202
  examples=[
203
+ ["https://raw.githubusercontent.com/python/cpython/main/README.rst"],
204
  ["https://docs.python.org/3/tutorial/introduction.html"],
205
+ ["https://raw.githubusercontent.com/huggingface/transformers/main/README.md"],
206
  ["https://pytorch.org/docs/stable/torch.html"],
 
207
  ],
208
  inputs=url_input
209
  )