Spaces:
Sleeping
Sleeping
File size: 1,964 Bytes
fbf4182 f9d767c fbf4182 f9d767c fbf4182 f9d767c fbf4182 f9d767c fbf4182 f9d767c fbf4182 f9d767c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 |
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
import asyncio
# Scheme and host of the Indian Kanoon site; search and case URLs are built
# by appending a path to this value.
BASE_URL = "https://indiankanoon.org"

# User-agent string of a desktop Chrome 122 build on 64-bit Windows 10.
_CHROME_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/122.0.0.0 Safari/537.36"
)

# Browser-like request headers: the Chrome user agent above plus an
# English-first language preference.
HEADERS = {
    "User-Agent": _CHROME_USER_AGENT,
    "Accept-Language": "en-US,en;q=0.9",
}
async def safe_get_content(url: str) -> str:
    """Load *url* in headless Chromium and return the page's HTML.

    A new Playwright session and browser are started for every call and
    shut down before the function returns. Navigation waits only for the
    ``domcontentloaded`` event, so the returned markup is the DOM as it
    stands at that point.

    The module-level ``HEADERS`` are applied to the browser context: the
    ``User-Agent`` entry becomes the context's user agent and the remaining
    entries are sent as extra HTTP headers on every request.

    Args:
        url: Address of the page to load.

    Returns:
        The serialized HTML of the loaded page.

    Raises:
        Any exception raised by Playwright while launching the browser or
        navigating (for example a navigation timeout) propagates to the
        caller; the browser is closed first.
    """
    # The user agent is a dedicated context option; everything else in
    # HEADERS goes through extra_http_headers.
    user_agent = HEADERS.get("User-Agent")
    extra_headers = {
        name: value
        for name, value in HEADERS.items()
        if name.lower() != "user-agent"
    }

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            context = await browser.new_context(
                user_agent=user_agent,
                extra_http_headers=extra_headers,
            )
            page = await context.new_page()
            await page.goto(url, wait_until="domcontentloaded")
            return await page.content()
        finally:
            # Close the browser even when navigation fails or times out.
            await browser.close()
async def search_cases(query, max_results=10):
    """Search Indian Kanoon for *query* and return the leading results.

    Args:
        query: Free-text search terms. The value is URL-encoded before it
            is placed in the query string, so characters such as ``&``,
            ``#``, ``+`` and spaces are sent as part of the search text
            rather than altering the URL.
        max_results: Number of ``.result_title`` entries, counted from the
            top of the results page, that are examined.

    Returns:
        A list of ``{"title": ..., "url": ...}`` dicts in page order.
        Entries without a link are skipped, so the list may be shorter than
        ``max_results``.
    """
    # Imported locally so the block does not depend on the file's import
    # section being changed.
    from urllib.parse import quote_plus, urljoin

    search_url = f"{BASE_URL}/search/?formInput={quote_plus(str(query))}"
    html_content = await safe_get_content(search_url)
    soup = BeautifulSoup(html_content, "html.parser")

    results = []
    for result in soup.select(".result_title")[:max_results]:
        title_tag = result.find("a")
        href = title_tag.get("href") if title_tag else None
        if not href:
            continue
        results.append({
            "title": title_tag.get_text(strip=True),
            # urljoin gives the same URL as plain concatenation for the
            # usual "/doc/..." links, and also copes with absolute URLs and
            # hrefs that lack a leading slash.
            "url": urljoin(BASE_URL, href),
        })
    return results
async def get_case_content(case_url):
    """Fetch a case page and return its text content.

    The page is parsed and a fixed list of CSS selectors is tried in
    order; the first one that matches an element with non-empty text wins.
    If none does, the text of every ``<p>`` element is joined with
    newlines instead.

    Args:
        case_url: Address of the case page to fetch.

    Returns:
        The extracted text, or the literal string ``"No content found."``
        when neither the selectors nor the paragraphs yield any text.
    """
    html_content = await safe_get_content(case_url)
    soup = BeautifulSoup(html_content, "html.parser")

    # Candidate containers, tried in this order.
    selectors = (
        "div#maincontent",
        "div.content",
        "pre",
        "div.article_text",
        "div.judgement-text",
    )
    for sel in selectors:
        content_div = soup.select_one(sel)
        if content_div is None:
            continue
        text = content_div.get_text(separator="\n", strip=True)
        if text:
            return text

    # Fallback: use the page's paragraphs, but only when at least one has
    # text, so a page of empty <p> tags yields the sentinel rather than a
    # blank string (matching the empty-text check applied above).
    paragraph_texts = [p.get_text(strip=True) for p in soup.find_all("p")]
    if any(paragraph_texts):
        return "\n".join(paragraph_texts)
    return "No content found."
|