Rivalcoder committed on
Commit
a745352
·
1 Parent(s): a275a62

Update Old Code

Browse files
__pycache__/app.cpython-312.pyc DELETED
Binary file (1.62 kB)
 
__pycache__/kanon_api.cpython-312.pyc DELETED
Binary file (3.47 kB)
 
__pycache__/predictor.cpython-312.pyc DELETED
Binary file (4.95 kB)
 
__pycache__/vectorstore.cpython-312.pyc DELETED
Binary file (2.27 kB)
 
kanon_api.py CHANGED
@@ -2,31 +2,15 @@ import requests
2
  from bs4 import BeautifulSoup
3
  from concurrent.futures import ThreadPoolExecutor, as_completed
4
 
5
- # --------------------
6
- # Proxy config (Webshare)
7
- # --------------------
8
- PROXIES = {
9
- "http": "http://uvhnvfjd:ze82v8cwwxpa@198.23.239.134:6540",
10
- "https": "http://uvhnvfjd:ze82v8cwwxpa@198.23.239.134:6540"
11
- }
12
-
13
- HEADERS = {
14
- "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
15
- "AppleWebKit/537.36 (KHTML, like Gecko) "
16
- "Chrome/120.0.0.0 Safari/537.36"
17
- }
18
-
19
  BASE_URL = "https://indiankanoon.org"
20
 
21
- # --------------------
22
- # Search cases
23
- # --------------------
24
  def search_cases(query, max_results=10):
25
  """
26
- Scrape search results from Indian Kanoon website via Webshare proxy.
 
27
  """
28
  search_url = f"{BASE_URL}/search/?formInput={query}"
29
- response = requests.get(search_url, proxies=PROXIES, headers=HEADERS, timeout=30)
30
  response.raise_for_status()
31
 
32
  soup = BeautifulSoup(response.text, "html.parser")
@@ -41,15 +25,13 @@ def search_cases(query, max_results=10):
41
  })
42
  return results
43
 
44
- # --------------------
45
- # Get case content
46
- # --------------------
47
  def get_case_content(case_url):
48
  """
49
- Scrape the full text of a case from its URL using proxy.
50
  """
51
  try:
52
- response = requests.get(case_url, proxies=PROXIES, headers=HEADERS, timeout=30)
53
  response.raise_for_status()
54
  soup = BeautifulSoup(response.text, "html.parser")
55
 
@@ -72,22 +54,39 @@ def get_case_content(case_url):
72
  if paragraphs:
73
  return "\n".join(p.get_text(strip=True) for p in paragraphs)
74
 
75
- except Exception as e:
76
- return f"Error fetching content: {e}"
77
 
78
  return "No content found."
79
 
80
- # --------------------
 
81
  # Parallel Case Fetching
82
- # --------------------
83
  def fetch_case_text(case):
 
 
 
84
  case['text'] = get_case_content(case['url'])
85
  return case
86
 
87
  def fetch_cases_parallel(cases, max_workers=5):
 
 
 
88
  results = []
89
  with ThreadPoolExecutor(max_workers=max_workers) as executor:
90
  futures = {executor.submit(fetch_case_text, case): case for case in cases}
91
  for future in as_completed(futures):
92
  results.append(future.result())
93
  return results
 
 
 
 
 
 
 
 
 
 
 
2
  from bs4 import BeautifulSoup
3
  from concurrent.futures import ThreadPoolExecutor, as_completed
4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  BASE_URL = "https://indiankanoon.org"
6
 
 
 
 
7
  def search_cases(query, max_results=10):
8
  """
9
+ Scrape search results from Indian Kanoon website.
10
+ Returns a list of case URLs and titles.
11
  """
12
  search_url = f"{BASE_URL}/search/?formInput={query}"
13
+ response = requests.get(search_url)
14
  response.raise_for_status()
15
 
16
  soup = BeautifulSoup(response.text, "html.parser")
 
25
  })
26
  return results
27
 
28
+
 
 
29
  def get_case_content(case_url):
30
  """
31
+ Scrape the full text of a case from its URL.
32
  """
33
  try:
34
+ response = requests.get(case_url)
35
  response.raise_for_status()
36
  soup = BeautifulSoup(response.text, "html.parser")
37
 
 
54
  if paragraphs:
55
  return "\n".join(p.get_text(strip=True) for p in paragraphs)
56
 
57
+ except Exception:
58
+ return None
59
 
60
  return "No content found."
61
 
62
+
63
+ # =========================
64
  # Parallel Case Fetching
65
+ # =========================
66
def fetch_case_text(case):
    """
    Fill in the 'text' entry of a single case dictionary.

    Looks up the case's 'url' key, scrapes its content via
    get_case_content, and stores the result under 'text'. The input
    dict is mutated in place and returned so the function composes
    cleanly with executor.submit / map pipelines.
    """
    url = case['url']
    case['text'] = get_case_content(url)
    return case
72
 
73
def fetch_cases_parallel(cases, max_workers=5):
    """
    Fetch the full text of multiple cases concurrently.

    Each dict in *cases* must carry a 'url' key; fetch_case_text fills
    in its 'text' key. Scraping is I/O-bound, so a thread pool is the
    right concurrency tool here.

    :param cases: list of case dicts (each is mutated in place).
    :param max_workers: size of the thread pool (default 5).
    :return: the same case dicts, collected in completion order —
             which may differ from the input order.
    """
    results = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # A plain list is enough: the original future->case dict's
        # values were never read, only its keys iterated.
        futures = [executor.submit(fetch_case_text, case) for case in cases]
        for future in as_completed(futures):
            results.append(future.result())
    return results
83
+
84
+
85
+ # # Example usage
86
+ # query = "Cheat in Neet exam"
87
+ # cases = search_cases(query, max_results=5)
88
+ # # Fetch content in parallel
89
+ # cases = fetch_cases_parallel(cases, max_workers=5)
90
+ # for case in cases:
91
+ # print(f"Title: {case['title']}")
92
+ # print(f"Content snippet: {case['text'][:1000]}...\n")