Prediction / kanon_api.py
Rivalcoder
Use of Playwright
fbf4182
raw
history blame
1.96 kB
import asyncio
from urllib.parse import quote_plus, urljoin

from bs4 import BeautifulSoup
from playwright.async_api import async_playwright
# Root of the Indian Kanoon site; search and document URLs are built from it.
BASE_URL = "https://indiankanoon.org"

# Request headers describing a desktop Chrome browser on Windows that
# prefers English content.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/122.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
}
async def safe_get_content(url: str) -> str:
    """Load *url* in headless Chromium and return the page's HTML.

    A fresh Playwright session and browser are started on every call. The
    browser context is created with the module-level ``HEADERS`` so the
    request carries the configured User-Agent and Accept-Language rather
    than the headless browser's defaults.

    Args:
        url: URL to navigate to.

    Returns:
        The page's HTML as returned by ``page.content()`` after navigation
        has reached the ``domcontentloaded`` state.

    Raises:
        Any error raised by Playwright while launching the browser or
        navigating is propagated to the caller unchanged.
    """
    # The User-Agent goes through the dedicated context option; every other
    # configured header is sent as an extra HTTP header on each request.
    extra_headers = {
        name: value
        for name, value in HEADERS.items()
        if name.lower() != "user-agent"
    }
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            context = await browser.new_context(
                user_agent=HEADERS.get("User-Agent"),
                extra_http_headers=extra_headers,
            )
            page = await context.new_page()
            await page.goto(url, wait_until="domcontentloaded")
            return await page.content()
        finally:
            # Close the browser even when navigation or content retrieval
            # raises, so a failed fetch does not leave it running.
            await browser.close()
async def search_cases(query, max_results=10):
    """Search Indian Kanoon and return the first matching results.

    Args:
        query: Raw (not yet URL-encoded) search text. It is converted to
            ``str`` and percent-encoded before being placed in the URL, so
            spaces and characters such as ``&`` or ``#`` are sent literally.
        max_results: Maximum number of results to return. ``None`` means no
            limit; zero or a negative number yields an empty list.

    Returns:
        A list of ``{"title": str, "url": str}`` dicts, in page order, one
        per ``.result_title`` element that contains a link with an ``href``.
    """
    # A negative slice bound would drop items from the end rather than
    # limit the count, so treat non-positive limits as "no results".
    if max_results is not None and max_results <= 0:
        return []

    search_url = f"{BASE_URL}/search/?formInput={quote_plus(str(query))}"
    html_content = await safe_get_content(search_url)
    soup = BeautifulSoup(html_content, "html.parser")

    results = []
    for result in soup.select(".result_title")[:max_results]:
        title_tag = result.find("a")
        if title_tag and title_tag.get("href"):
            results.append({
                "title": title_tag.get_text(strip=True),
                # urljoin resolves site-relative hrefs against BASE_URL and
                # leaves already-absolute hrefs intact.
                "url": urljoin(BASE_URL, title_tag["href"]),
            })
    return results
async def get_case_content(case_url):
    """Fetch a case page and extract its text.

    Candidate container selectors are tried in order and the first one that
    yields non-empty text wins. If none match, the text of all ``<p>``
    elements is joined with newlines instead.

    Args:
        case_url: URL of the case page to fetch.

    Returns:
        The extracted text, or the string ``"No content found."`` when
        neither the candidate containers nor the paragraphs contain any text.
    """
    html_content = await safe_get_content(case_url)
    soup = BeautifulSoup(html_content, "html.parser")

    # Ordered by preference: the first selector producing text is used.
    selectors = (
        "div#maincontent",
        "div.content",
        "pre",
        "div.article_text",
        "div.judgement-text",
    )
    for sel in selectors:
        content_div = soup.select_one(sel)
        if content_div is None:
            continue
        text = content_div.get_text(separator="\n", strip=True)
        if text:
            return text

    # Fallback: concatenate every paragraph. Only accept the result when it
    # holds real text, so a page of empty <p> tags reports "No content
    # found." instead of returning blank lines.
    paragraph_text = "\n".join(
        p.get_text(strip=True) for p in soup.find_all("p")
    )
    if paragraph_text.strip():
        return paragraph_text
    return "No content found."