Spaces:
Sleeping
Sleeping
| from playwright.async_api import async_playwright | |
| from bs4 import BeautifulSoup | |
| import asyncio | |
# Root of the Indian Kanoon site; site-relative paths are appended to this.
BASE_URL = "https://indiankanoon.org"

# Browser-like request headers: a desktop Chrome (Windows 10, x64) user agent
# and an English language preference.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/122.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
}
async def safe_get_content(url: str) -> str:
    """Load *url* in headless Chromium and return the page's HTML.

    A new browser is launched for every call. The browser context is created
    from the module-level ``HEADERS``: its ``User-Agent`` entry (matched
    case-insensitively) becomes the context's user agent, and every other
    entry is sent as an extra HTTP header.

    Args:
        url: Address to navigate to.

    Returns:
        The HTML of the page once navigation has reached
        ``domcontentloaded``.

    Raises:
        Any error raised by Playwright during launch or navigation is
        propagated to the caller; the browser is closed first.
    """
    # The user agent has a dedicated context option, so split it out from
    # the headers that are passed through as-is.
    user_agent = None
    extra_headers = {}
    for name, value in HEADERS.items():
        if name.lower() == "user-agent":
            user_agent = value
        else:
            extra_headers[name] = value

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            context = await browser.new_context(
                user_agent=user_agent,
                extra_http_headers=extra_headers,
            )
            page = await context.new_page()
            await page.goto(url, wait_until="domcontentloaded")
            return await page.content()
        finally:
            # Close the browser even when navigation fails or times out.
            await browser.close()
async def search_cases(query, max_results=10):
    """Search Indian Kanoon for *query* and return the matching case links.

    Args:
        query: Search text. It is URL-encoded before being placed in the
            ``formInput`` query parameter, so spaces and reserved
            characters such as ``&`` or ``#`` are safe.
        max_results: Maximum number of results to return. ``None`` means no
            limit; zero or a negative value yields an empty list without
            fetching anything.

    Returns:
        A list of ``{"title": str, "url": str}`` dicts, one per
        ``.result_title`` element that contains a link with an ``href``,
        in page order. ``url`` is resolved against ``BASE_URL``.
    """
    # Imported locally so this function carries its own stdlib dependencies.
    from urllib.parse import urlencode, urljoin

    limit = None if max_results is None else max(max_results, 0)
    if limit == 0:
        return []

    search_url = f"{BASE_URL}/search/?{urlencode({'formInput': query})}"
    html_content = await safe_get_content(search_url)
    soup = BeautifulSoup(html_content, "html.parser")

    results = []
    for heading in soup.select(".result_title"):
        # Apply the limit to usable results, not to raw headings, so that
        # headings without a link do not eat into the quota.
        if limit is not None and len(results) >= limit:
            break
        link = heading.find("a")
        href = link.get("href") if link else None
        if not href:
            continue
        results.append({
            "title": link.get_text(strip=True),
            # urljoin copes with absolute hrefs and hrefs lacking a
            # leading slash, which plain concatenation would corrupt.
            "url": urljoin(BASE_URL, href),
        })
    return results
async def get_case_content(case_url):
    """Fetch a case page and return its text.

    The page is parsed with ``html.parser`` and a fixed list of CSS
    selectors is tried in order; the first one that matches an element with
    non-empty text wins, and that element's text is returned with one line
    per text fragment. If none of them yields text, the text of every
    ``<p>`` element is joined with newlines instead.

    Args:
        case_url: URL of the case page to fetch.

    Returns:
        The extracted text, or the string ``"No content found."`` when no
        selector yields text and the page contains no ``<p>`` elements.
    """
    page_html = await safe_get_content(case_url)
    document = BeautifulSoup(page_html, "html.parser")

    # Tried in this order; earlier entries take precedence.
    candidate_selectors = (
        "div#maincontent",
        "div.content",
        "pre",
        "div.article_text",
        "div.judgement-text",
    )
    for css in candidate_selectors:
        node = document.select_one(css)
        if not node:
            continue
        body = node.get_text(separator="\n", strip=True)
        if body:
            return body

    # Fallback: stitch together whatever paragraphs the page has.
    paragraph_tags = document.find_all("p")
    if not paragraph_tags:
        return "No content found."
    return "\n".join(tag.get_text(strip=True) for tag in paragraph_tags)