import logging
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

BASE_URL = "https://indiankanoon.org"

# Seconds to wait for the server before giving up; without a timeout a single
# stalled connection would block a worker thread forever.
DEFAULT_TIMEOUT = 15

# CSS selectors tried, in order, when looking for the body of a case page.
_CONTENT_SELECTORS = (
    "div#maincontent",
    "div.content",
    "pre",
    "div.article_text",
    "div.judgement-text",
)

logger = logging.getLogger(__name__)


def search_cases(query, max_results=10, timeout=DEFAULT_TIMEOUT):
    """
    Scrape search results from the Indian Kanoon website.

    Args:
        query: Free-text search string. It is URL-encoded by ``requests``,
            so characters such as ``&`` or ``#`` are safe to pass.
        max_results: Number of ``.result_title`` entries to inspect. Entries
            without a link are dropped, so fewer items may be returned.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        A list of dicts with the keys ``"title"`` and ``"url"``, in the order
        the results appear on the page.

    Raises:
        requests.RequestException: If the request fails or the server
            responds with an HTTP error status.
    """
    response = requests.get(
        f"{BASE_URL}/search/",
        params={"formInput": query},
        timeout=timeout,
    )
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    results = []
    for result in soup.select(".result_title")[:max_results]:
        title_tag = result.find("a")
        if title_tag and title_tag.get("href"):
            results.append({
                "title": title_tag.get_text(strip=True),
                # urljoin copes with both relative and absolute hrefs.
                "url": urljoin(BASE_URL, title_tag["href"]),
            })
    return results


def get_case_content(case_url, timeout=DEFAULT_TIMEOUT):
    """
    Scrape the full text of a case from its URL.

    Args:
        case_url: Absolute URL of the case page.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        The extracted text; the string ``"No content found."`` when the page
        loads but contains neither a known content container nor any ``<p>``
        tags; or ``None`` when the page could not be fetched or parsed. This
        function never raises: failures are logged and reported as ``None``.
    """
    try:
        response = requests.get(case_url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Prefer a dedicated content container; skip containers that are
        # present but empty.
        for selector in _CONTENT_SELECTORS:
            content_div = soup.select_one(selector)
            if content_div is None:
                continue
            text = content_div.get_text(separator="\n", strip=True)
            if text:
                return text

        # Fall back to concatenating every paragraph on the page.
        paragraphs = soup.find_all("p")
        if paragraphs:
            return "\n".join(p.get_text(strip=True) for p in paragraphs)
    except requests.RequestException as exc:
        logger.warning("Could not fetch case %s: %s", case_url, exc)
        return None
    except Exception:
        # Best-effort scraping: keep the never-raises contract, but leave a
        # traceback in the log instead of swallowing the error silently.
        logger.exception("Unexpected error while scraping case %s", case_url)
        return None
    return "No content found."


# =========================
# Parallel Case Fetching
# =========================

def fetch_case_text(case):
    """
    Fetch case content safely for a single case dictionary.

    The dictionary is updated in place: its ``"text"`` key is set to the
    result of ``get_case_content(case["url"])``, which is ``None`` when the
    fetch failed.

    Args:
        case: Dict with at least a ``"url"`` key.

    Returns:
        The same dict object that was passed in.
    """
    case['text'] = get_case_content(case['url'])
    return case


def fetch_cases_parallel(cases, max_workers=5):
    """
    Fetch multiple cases in parallel using ThreadPoolExecutor.

    Args:
        cases: Iterable of case dicts, each with a ``"url"`` key. Every dict
            is updated in place with a ``"text"`` key.
        max_workers: Maximum number of worker threads.

    Returns:
        A list of the case dicts in the same order as the input, so the
        ranking of the search results is preserved regardless of which
        download finishes first.
    """
    case_list = list(cases)
    if not case_list:
        return []

    # Pre-sized because results arrive out of order and are placed by index.
    results = [None] * len(case_list)
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {
            executor.submit(fetch_case_text, case): index
            for index, case in enumerate(case_list)
        }
        for future in as_completed(futures):
            results[futures[future]] = future.result()
    return results


# # Example usage
# query = "Cheat in Neet exam"
# cases = search_cases(query, max_results=5)

# # Fetch content in parallel
# cases = fetch_cases_parallel(cases, max_workers=5)

# for case in cases:
#     print(f"Title: {case['title']}")
#     # case['text'] is None when the page could not be fetched.
#     print(f"Content snippet: {(case['text'] or '')[:1000]}...\n")