File size: 2,646 Bytes
f9d767c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

BASE_URL = "https://indiankanoon.org"

def search_cases(query, max_results=10, timeout=10):
    """
    Scrape search results from the Indian Kanoon website.

    Args:
        query: Free-text search string. It is sent as the ``formInput``
            query parameter and URL-encoded by ``requests``, so characters
            such as spaces, ``&``, ``#`` and ``+`` cannot corrupt the URL.
        max_results: Maximum number of ``.result_title`` entries to inspect.
            Entries without a usable link are skipped, so fewer results
            may be returned.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of ``{"title": ..., "url": ...}`` dicts in page order.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an error status.
    """
    # Let requests build the query string instead of interpolating the raw
    # query into the URL, which breaks on reserved characters.
    response = requests.get(
        f"{BASE_URL}/search/",
        params={"formInput": query},
        timeout=timeout,
    )
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    results = []

    for result in soup.select(".result_title")[:max_results]:
        title_tag = result.find("a")
        if title_tag and title_tag.get("href"):
            results.append({
                "title": title_tag.get_text(strip=True),
                # urljoin handles relative hrefs (with or without a leading
                # slash) as well as already-absolute ones.
                "url": urljoin(BASE_URL, title_tag["href"]),
            })
    return results


def get_case_content(case_url, timeout=10):
    """
    Scrape the full text of a case from its URL.

    Args:
        case_url: Absolute URL of the case page.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the caller indefinitely.

    Returns:
        The extracted text when a known content container (or, failing
        that, any ``<p>`` element) is found; the string
        ``"No content found."`` when the page loaded but nothing matched;
        ``None`` when fetching or parsing raised an exception.
    """
    try:
        response = requests.get(case_url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Candidate containers, tried in order; the first one that yields
        # non-empty text wins.
        selectors = [
            "div#maincontent",
            "div.content",
            "pre",
            "div.article_text",
            "div.judgement-text"
        ]

        for sel in selectors:
            content_div = soup.select_one(sel)
            if content_div:
                text = content_div.get_text(separator="\n", strip=True)
                if text:
                    return text

        # Fallback: stitch together every paragraph on the page.
        paragraphs = soup.find_all("p")
        if paragraphs:
            return "\n".join(p.get_text(strip=True) for p in paragraphs)

    except Exception:
        # Deliberately broad: this function is a best-effort boundary and
        # must not raise. Record the failure instead of hiding it.
        logging.getLogger(__name__).warning(
            "Failed to fetch case content from %s", case_url, exc_info=True
        )
        return None

    return "No content found."


# =========================
# Parallel Case Fetching
# =========================
def fetch_case_text(case):
    """
    Populate a single case dictionary with its scraped content.

    Reads ``case['url']``, stores the result of ``get_case_content`` under
    ``case['text']`` (modifying the dictionary in place), and returns the
    same dictionary.
    """
    content = get_case_content(case['url'])
    case['text'] = content
    return case

def fetch_cases_parallel(cases, max_workers=5):
    """
    Fetch multiple cases in parallel using ThreadPoolExecutor.

    Args:
        cases: Iterable of case dictionaries, each passed to
            ``fetch_case_text``.
        max_workers: Maximum number of worker threads.

    Returns:
        A list with the value returned by ``fetch_case_text`` for each
        case, in the same order as ``cases``. Keeping the input order
        preserves the ranking of the search results, whereas collecting
        futures as they complete makes the order depend on network timing.

    Raises:
        Exception: Any exception raised by ``fetch_case_text`` for a case
            is re-raised when its result is collected.
    """
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit everything first so the fetches overlap, then collect the
        # results in submission order.
        futures = [executor.submit(fetch_case_text, case) for case in cases]
        return [future.result() for future in futures]


# Example usage:
# query = "Cheat in Neet exam"
# cases = search_cases(query, max_results=5)
# # Fetch content in parallel
# cases = fetch_cases_parallel(cases, max_workers=5)
# for case in cases:
#     print(f"Title: {case['title']}")
#     # case['text'] is None when fetching failed, so guard before slicing.
#     print(f"Content snippet: {(case['text'] or '')[:1000]}...\n")