import asyncio
import os
from urllib.parse import quote_plus

# Must be set BEFORE importing playwright so Chromium binaries are looked up
# in /tmp (works on read-only / serverless filesystems).
os.environ["PLAYWRIGHT_BROWSERS_PATH"] = "/tmp/playwright-browsers"

from bs4 import BeautifulSoup
from playwright.async_api import async_playwright

BASE_URL = "https://indiankanoon.org"
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/122.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
}


async def safe_get_content(url: str) -> str:
    """Fetch *url* in headless Chromium and return the rendered HTML.

    Waits only for ``domcontentloaded`` (not full network idle), which is
    sufficient because the extraction below reads server-rendered markup.

    The browser process is always closed, even if navigation raises.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            # Fix: HEADERS was previously defined but never sent. Apply the
            # spoofed User-Agent / Accept-Language to every request made by
            # this context.
            context = await browser.new_context(
                user_agent=HEADERS["User-Agent"],
                extra_http_headers={
                    "Accept-Language": HEADERS["Accept-Language"],
                },
            )
            page = await context.new_page()
            await page.goto(url, wait_until="domcontentloaded")
            return await page.content()
        finally:
            # Fix: without this, a failed goto() leaked the browser process.
            await browser.close()


async def search_cases(query: str, max_results: int = 10) -> list[dict]:
    """Search Indian Kanoon for *query*.

    Args:
        query: Free-text search string (will be percent-encoded).
        max_results: Maximum number of hits to return.

    Returns:
        A list of ``{"title": ..., "url": ...}`` dicts with absolute URLs;
        empty list when nothing matched.
    """
    # Fix: the raw query was interpolated into the URL, breaking searches
    # containing spaces, '&', '#', '+', or non-ASCII. Percent-encode it.
    search_url = f"{BASE_URL}/search/?formInput={quote_plus(query)}"
    html_content = await safe_get_content(search_url)
    soup = BeautifulSoup(html_content, "html.parser")
    results = []
    for result in soup.select(".result_title")[:max_results]:
        title_tag = result.find("a")
        # Skip malformed entries with no anchor or no href.
        if title_tag and title_tag.get("href"):
            results.append({
                "title": title_tag.get_text(strip=True),
                "url": BASE_URL + title_tag["href"],
            })
    return results


async def get_case_content(case_url: str) -> str:
    """Fetch a case page and extract its judgment text.

    Tries a list of known content-container selectors in order (Indian
    Kanoon has used several page layouts), then falls back to joining all
    ``<p>`` tags.

    Returns:
        The extracted text, or ``"No content found."`` when nothing matched.
    """
    html_content = await safe_get_content(case_url)
    soup = BeautifulSoup(html_content, "html.parser")
    selectors = [
        "div#maincontent",
        "div.content",
        "pre",
        "div.article_text",
        "div.judgement-text",
    ]
    for sel in selectors:
        content_div = soup.select_one(sel)
        if content_div:
            text = content_div.get_text(separator="\n", strip=True)
            # An empty container (e.g. a bare wrapper div) falls through to
            # the next selector instead of returning "".
            if text:
                return text
    paragraphs = soup.find_all("p")
    if paragraphs:
        return "\n".join(p.get_text(strip=True) for p in paragraphs)
    return "No content found."