Rivalcoder committed on
Commit
a275a62
·
1 Parent(s): 5c309f7
Files changed (1) hide show
  1. kanon_api.py +28 -27
kanon_api.py CHANGED
@@ -2,15 +2,31 @@ import requests
2
  from bs4 import BeautifulSoup
3
  from concurrent.futures import ThreadPoolExecutor, as_completed
4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  BASE_URL = "https://indiankanoon.org"
6
 
 
 
 
7
  def search_cases(query, max_results=10):
8
  """
9
- Scrape search results from Indian Kanoon website.
10
- Returns a list of case URLs and titles.
11
  """
12
  search_url = f"{BASE_URL}/search/?formInput={query}"
13
- response = requests.get(search_url)
14
  response.raise_for_status()
15
 
16
  soup = BeautifulSoup(response.text, "html.parser")
@@ -25,13 +41,15 @@ def search_cases(query, max_results=10):
25
  })
26
  return results
27
 
28
-
 
 
29
  def get_case_content(case_url):
30
  """
31
- Scrape the full text of a case from its URL.
32
  """
33
  try:
34
- response = requests.get(case_url)
35
  response.raise_for_status()
36
  soup = BeautifulSoup(response.text, "html.parser")
37
 
@@ -54,39 +72,22 @@ def get_case_content(case_url):
54
  if paragraphs:
55
  return "\n".join(p.get_text(strip=True) for p in paragraphs)
56
 
57
- except Exception:
58
- return None
59
 
60
  return "No content found."
61
 
62
-
63
- # =========================
64
  # Parallel Case Fetching
65
- # =========================
66
  def fetch_case_text(case):
67
- """
68
- Fetch case content safely for a single case dictionary.
69
- """
70
  case['text'] = get_case_content(case['url'])
71
  return case
72
 
73
  def fetch_cases_parallel(cases, max_workers=5):
74
- """
75
- Fetch multiple cases in parallel using ThreadPoolExecutor.
76
- """
77
  results = []
78
  with ThreadPoolExecutor(max_workers=max_workers) as executor:
79
  futures = {executor.submit(fetch_case_text, case): case for case in cases}
80
  for future in as_completed(futures):
81
  results.append(future.result())
82
  return results
83
-
84
-
85
- # # Example usage
86
- # query = "Cheat in Neet exam"
87
- # cases = search_cases(query, max_results=5)
88
- # # Fetch content in parallel
89
- # cases = fetch_cases_parallel(cases, max_workers=5)
90
- # for case in cases:
91
- # print(f"Title: {case['title']}")
92
- # print(f"Content snippet: {case['text'][:1000]}...\n")
 
2
  from bs4 import BeautifulSoup
3
  from concurrent.futures import ThreadPoolExecutor, as_completed
4
 
5
# --------------------
# Proxy config (Webshare)
# --------------------
# SECURITY NOTE(review): these proxy credentials are hardcoded and have been
# committed to version control. They should be rotated and loaded from an
# environment variable or secrets manager instead of living in source.
PROXIES = {
    "http": "http://uvhnvfjd:ze82v8cwwxpa@198.23.239.134:6540",
    "https": "http://uvhnvfjd:ze82v8cwwxpa@198.23.239.134:6540",
}

# Browser-like User-Agent so requests are less likely to be blocked as a bot.
# The three adjacent string literals are concatenated into a single header value.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/120.0.0.0 Safari/537.36"
}

# Root of the Indian Kanoon site; all scraped URLs are built from this.
BASE_URL = "https://indiankanoon.org"
20
 
21
+ # --------------------
22
+ # Search cases
23
+ # --------------------
24
  def search_cases(query, max_results=10):
25
  """
26
+ Scrape search results from Indian Kanoon website via Webshare proxy.
 
27
  """
28
  search_url = f"{BASE_URL}/search/?formInput={query}"
29
+ response = requests.get(search_url, proxies=PROXIES, headers=HEADERS, timeout=30)
30
  response.raise_for_status()
31
 
32
  soup = BeautifulSoup(response.text, "html.parser")
 
41
  })
42
  return results
43
 
44
+ # --------------------
45
+ # Get case content
46
+ # --------------------
47
  def get_case_content(case_url):
48
  """
49
+ Scrape the full text of a case from its URL using proxy.
50
  """
51
  try:
52
+ response = requests.get(case_url, proxies=PROXIES, headers=HEADERS, timeout=30)
53
  response.raise_for_status()
54
  soup = BeautifulSoup(response.text, "html.parser")
55
 
 
72
  if paragraphs:
73
  return "\n".join(p.get_text(strip=True) for p in paragraphs)
74
 
75
+ except Exception as e:
76
+ return f"Error fetching content: {e}"
77
 
78
  return "No content found."
79
 
80
+ # --------------------
 
81
  # Parallel Case Fetching
82
+ # --------------------
83
def fetch_case_text(case):
    """Populate ``case['text']`` with the case's full text and return the dict.

    Worker used by :func:`fetch_cases_parallel`. Mutates the given case
    dictionary in place (adds/overwrites the ``'text'`` key) and returns it.

    Args:
        case: dict with at least a ``'url'`` key pointing at a case page.

    Returns:
        The same dict, augmented with ``'text'``.
    """
    case['text'] = get_case_content(case['url'])
    return case
86
 
87
def fetch_cases_parallel(cases, max_workers=5):
    """Fetch the full text of multiple cases concurrently.

    Submits one :func:`fetch_case_text` job per case to a thread pool and
    collects the mutated case dicts as the jobs finish.

    Args:
        cases: list of case dicts, each with at least a ``'url'`` key.
        max_workers: size of the thread pool (default 5).

    Returns:
        The case dicts, each augmented with a ``'text'`` key. NOTE: results
        are appended in *completion* order, which may differ from the input
        order.
    """
    results = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(fetch_case_text, case): case for case in cases}
        for future in as_completed(futures):
            results.append(future.result())
    return results