# Nepal_Kanun_Patrika_Scrapper / Kanun_Patrika_Scraper_For_HFSpaces.py
# rbbist's picture
# Rename Kanun_Patrika_Scrapper_For_HFSpaces.py to Kanun_Patrika_Scraper_For_HFSpaces.py
# 2a9e5a8 verified
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import os
import sys
import argparse
from urllib.parse import urlencode
import sqlite3
import json
import re
from pathlib import Path
import glob
import nepali_datetime
class LegalCaseScraper:
def __init__(self, output_db="legal_cases_2.db", html_folder="scraped_html"):
self.mudda_type_arr = [
"दुनियाबादी देवानी",
"सरकारबादी देवानी",
"दुनियावादी फौजदारी",
"सरकारवादी फौजदारी",
"रिट",
"निवेदन",
"विविध"
]
self.successful_entries = 0
self.not_entered_links = []
self.still_not_entered_links = []
self.output_db = output_db
self.html_folder = html_folder
# Create HTML folder if it doesn't exist
os.makedirs(self.html_folder, exist_ok=True)
# Initialize SQLite database
self.conn = sqlite3.connect(self.output_db)
self.create_tables()
def create_tables(self):
"""Create SQLite tables for scraped data and failed links"""
cursor = self.conn.cursor()
# Table for scraped case data
cursor.execute('''
CREATE TABLE IF NOT EXISTS cases (
id INTEGER PRIMARY KEY AUTOINCREMENT,
लिङ्क TEXT UNIQUE,
निर्णय_नं TEXT,
भाग TEXT,
मुद्दाको_किसिम TEXT,
साल TEXT,
महिना TEXT,
अंक TEXT,
फैसला_मिति TEXT,
अदालत_वा_इजलास TEXT,
न्यायाधीश TEXT,
आदेश_मिति TEXT,
केस_नम्बर TEXT,
विषय TEXT,
निवेदक TEXT,
विपक्षी TEXT,
प्रकरण TEXT,
ठहर TEXT,
html_file_path TEXT,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
''')
# Table for failed links
cursor.execute('''
CREATE TABLE IF NOT EXISTS failed_links (
id INTEGER PRIMARY KEY AUTOINCREMENT,
मुद्दाको_किसिम TEXT,
साल TEXT,
लिङ्क TEXT,
error_message TEXT,
retry_count INTEGER DEFAULT 0,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
''')
self.conn.commit()
def get_mudda_type_number(self, mudda_type):
"""Get mudda type number (1-7) from mudda type name"""
try:
return str(self.mudda_type_arr.index(mudda_type) + 1)
except ValueError:
raise ValueError(f"Invalid mudda_type: {mudda_type}. Must be one of {self.mudda_type_arr}")
def extract_link_number(self, url):
    """Return the digits that close *url* (``/<digits>`` with an optional
    trailing slash), or the string "unknown" when there are none."""
    trailing_id = re.search(r'/(\d+)/?$', url)
    if trailing_id is None:
        return "unknown"
    return trailing_id.group(1)
def generate_html_filename(self, url, mudda_type, sal):
"""Generate standardized HTML filename: mudda_number_year_link_number.html"""
mudda_number = self.get_mudda_type_number(mudda_type)
english_sal = self.nepali_sal_to_english_sal(sal)
link_number = self.extract_link_number(url)
return f"{mudda_number}_{english_sal}_{link_number}.html"
def nepali_sal_to_english_sal(self, sal):
    """Replace Devanagari digits in *sal* with ASCII digits.

    Args:
        sal: The value to convert; it is passed through ``str()`` first.

    Returns:
        The converted text, with every non-digit character left as it
        was. Any falsy *sal* (``None``, ``""``, ``0``) yields ``""``.

    Raises:
        ValueError: If converting *sal* raises TypeError or AttributeError.
    """
    if not sal:
        return ""
    # One translation table covering the ten Devanagari digits.
    digit_table = str.maketrans("०१२३४५६७८९", "0123456789")
    try:
        return str(sal).translate(digit_table)
    except (TypeError, AttributeError):
        raise ValueError(f"Input must be a string containing Nepali numerals, got: {type(sal)}")
def search_url(self, mudda_type, sal):
"""Generate search URL based on mudda_type and sal"""
mudda_types = {name: str(idx + 1) for idx, name in enumerate(self.mudda_type_arr)}
if mudda_type not in mudda_types:
raise ValueError(f"Invalid mudda_type: {mudda_type}. Must be one of {self.mudda_type_arr}")
english_sal = self.nepali_sal_to_english_sal(sal)
base_url = "https://nkp.gov.np/"
params = {
"mudda_number": "",
"faisala_date_from": "",
"faisala_date_to": "",
"mudda_type": mudda_types[mudda_type],
"mudda_name": "",
"badi": "",
"pratibadi": "",
"judge": "",
"ijlas_type": "",
"nirnaya_number": "",
"faisala_type": "",
"keywords": "",
"edition": "",
"year": english_sal,
"month": "",
"volume": "",
"Submit": "खोज्‍नुहोस्"
}
return f"{base_url}?{urlencode(params)}#"
def save_html_file(self, url, html_content, mudda_type, sal):
"""Save HTML content to file with standardized naming"""
filename = self.generate_html_filename(url, mudda_type, sal)
filepath = os.path.join(self.html_folder, filename)
with open(filepath, "w", encoding="utf-8") as f:
f.write(html_content)
return filepath
def load_html_file(self, url, mudda_type, sal):
"""Load HTML content from existing file"""
filename = self.generate_html_filename(url, mudda_type, sal)
filepath = os.path.join(self.html_folder, filename)
if os.path.exists(filepath):
with open(filepath, "r", encoding="utf-8") as f:
return f.read()
return None
def return_soup(self, url, mudda_type=None, sal=None, use_saved=True, max_retries=3):
    """Return parsed HTML for *url*, from a saved file or from the web.

    Args:
        url: Page address.
        mudda_type: Category; together with *sal* it names the saved file.
        sal: Year; together with *mudda_type* it names the saved file.
        use_saved: When true (and both *mudda_type* and *sal* are given),
            a non-empty saved file is used instead of downloading.
        max_retries: Number of download attempts.

    Returns:
        A ``BeautifulSoup`` object, or ``None`` when every attempt failed.
    """
    has_cache_key = mudda_type and sal

    # Saved copy first, when the caller allows it.
    if use_saved and has_cache_key:
        cached_html = self.load_html_file(url, mudda_type, sal)
        if cached_html:
            print(f"Using saved HTML file for {url}")
            return BeautifulSoup(cached_html, 'html.parser')

    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    final_attempt = max_retries - 1
    for attempt in range(max_retries):
        try:
            response = requests.get(url, timeout=30, headers=request_headers)
            if response.status_code != 200:
                print(f"Attempt {attempt + 1}: Failed to retrieve {url}. Status code: {response.status_code}")
                # Exponential pause (1s, 2s, 4s, ...) except after the last try.
                if attempt < final_attempt:
                    time.sleep(2 ** attempt)
                continue
            response.encoding = 'utf-8'
            # Keep a copy on disk only when the file name can be derived.
            if has_cache_key:
                saved_path = self.save_html_file(url, response.text, mudda_type, sal)
                print(f"Saved HTML to: {saved_path}")
            return BeautifulSoup(response.text, 'html.parser')
        except requests.exceptions.RequestException as e:
            print(f"Attempt {attempt + 1}: Error scraping {url}: {e}")
            if attempt < final_attempt:
                time.sleep(2 ** attempt)
    return None
def get_saved_html_files_by_criteria(self, mudda_type=None, sal=None):
"""Get list of saved HTML files matching criteria"""
pattern = "*"
if mudda_type and sal:
mudda_number = self.get_mudda_type_number(mudda_type)
english_sal = self.nepali_sal_to_english_sal(sal)
pattern = f"{mudda_number}_{english_sal}_*.html"
elif sal:
english_sal = self.nepali_sal_to_english_sal(sal)
pattern = f"*_{english_sal}_*.html"
elif mudda_type:
mudda_number = self.get_mudda_type_number(mudda_type)
pattern = f"{mudda_number}_*_*.html"
search_path = os.path.join(self.html_folder, pattern)
return glob.glob(search_path)
def extract_info_from_filename(self, filename):
"""Extract mudda_type, sal, and link_number from filename"""
basename = os.path.basename(filename)
match = re.match(r'(\d+)_(\d+)_(\d+)\.html', basename)
if match:
mudda_number, sal, link_number = match.groups()
mudda_type = self.mudda_type_arr[int(mudda_number) - 1]
return mudda_type, sal, link_number
return None, None, None
def from_each_page(self, links):
    """Collect the hrefs that directly follow "#" anchors.

    For every element of *links* whose ``href`` contains ``#``, the
    ``href`` of the element right after it is collected (when present
    and non-empty).

    Args:
        links: Sequence of objects exposing ``.get('href')``.

    Returns:
        The collected hrefs without duplicates, in first-seen order.
        When fewer than two hrefs were collected the result is ``[]``.
    """
    followers = []
    for anchor, successor in zip(links, links[1:]):
        marker = anchor.get('href')
        if not marker or "#" not in marker:
            continue
        target = successor.get('href')
        if target:
            followers.append(target)

    if len(followers) <= 1:
        return []
    return list(dict.fromkeys(followers))
def get_all_pages(self, initial_url, mudda_type=None, sal=None, use_saved=True):
    """Collect the case links of a search result across all its pages.

    The first page is fetched, its links are passed to
    ``from_each_page``, and when it contains pager links the remaining
    pages (offsets 20, 40, ... up to the largest offset found) are
    fetched and processed the same way.

    Args:
        initial_url: URL of the first result page.
        mudda_type: Accepted for backward compatibility; not used for
            listing pages (see note below).
        sal: Accepted for backward compatibility; not used for listing
            pages.
        use_saved: Accepted for backward compatibility; listing pages
            are always fetched live.

    Returns:
        Case links without duplicates, in first-seen order; ``[]`` when
        the first page cannot be fetched.
    """
    # Listing pages are requested without mudda_type/sal on purpose.
    # Their URLs do not end in "/<digits>", so extract_link_number() gives
    # "unknown" for every one of them and they would all share a single
    # saved file; with use_saved=True each later page would then be
    # answered with the first page's HTML.
    soup = self.return_soup(initial_url)
    if not soup:
        return []

    links = soup.find_all('a')
    hrefs = [href for href in (link.get('href') for link in links) if href]
    pager_links = [href for href in hrefs if "https://nkp.gov.np/advance_search/" in href]

    case_links = self.from_each_page(links)

    # Handle pagination
    if "javascript:void(0)" in hrefs and pager_links:
        page_step = 20
        max_offset = 0
        page_prefix = None
        for href in pager_links:
            # The offset is whatever follows the last "=" of a pager link.
            prefix, sep, tail = href.rpartition("=")
            try:
                offset = int(tail)
            except ValueError:
                continue
            # Keep everything up to and including that "=" so offsets of
            # any digit count can be appended.
            if sep and page_prefix is None:
                page_prefix = prefix + sep
            max_offset = max(max_offset, offset)

        if max_offset > 0 and page_prefix is not None:
            for offset in range(page_step, max_offset + 1, page_step):
                page_url = f"{page_prefix}{offset}"
                print(f"Processing page: {page_url}")
                try:
                    page_soup = self.return_soup(page_url)
                    if page_soup:
                        case_links += self.from_each_page(page_soup.find_all('a'))
                except Exception as e:
                    # Best effort: one bad page must not lose the others.
                    print(f"Error scraping page {page_url}: {e}")

    # Remove duplicates
    return list(dict.fromkeys(case_links))
def get_edition_field(self, soup, label):
    """Read one value from the ``<div id="edition-info">`` block.

    The first ``<span>`` whose text contains *label* decides the result.

    Returns:
        The stripped text of that span's ``<strong>`` child, or ``None``
        when the block, a matching span, or the ``<strong>`` is missing.
    """
    container = soup.find("div", id="edition-info")
    if not container:
        return None
    for span in container.find_all("span"):
        if label not in span.text:
            continue
        value_tag = span.find("strong")
        if value_tag:
            return value_tag.text.strip()
        return None
    return None
def determine_scraper_method(self, sal):
"""Determine which scraper method to use based on year"""
eng_sal = int(self.nepali_sal_to_english_sal(sal))
today = nepali_datetime.date.today()
latest_nepali_year = int(today.year)
if 2015 <= eng_sal <= 2044:
return self.scrape_case_details_2015_to_2044
elif 2045 <= eng_sal <= 2050:
return self.scrape_case_details_2045_to_2050
elif 2051 <= eng_sal <= 2061:
return self.scrape_case_details_2051_to_2061
elif 2062 <= eng_sal <= 2072:
return self.scrape_case_details_2062_to_2072
elif 2073 <= eng_sal < latest_nepali_year:
return self.scrape_case_details_2073_to_2080_and_beyond
else:
raise ValueError(f"No scraper method available for year {eng_sal} or those records not yet available in Nepal Kanun Patrika Website")
def scrape_case_details_generic(self, url, mudda_type, sal, use_saved=True):
"""Generic method that routes to the appropriate scraper based on year"""
try:
scraper_method = self.determine_scraper_method(sal)
return scraper_method(url, mudda_type, sal, use_saved)
except ValueError as e:
print(f"Error: {e}")
return False
# [Previous scraper methods with modifications for HTML file handling]
def scrape_case_details_2015_to_2044(self, url, mudda_type, sal=None, use_saved=True):
"""Scrape one case page with the heuristics used for years 2015-2044.

URLs already present in the `cases` table are skipped. Otherwise the page
is obtained through `return_soup`, the <h1>/<p> tags of the detail <div>
are scanned with keyword lists, and the assembled record is handed to
`save_to_sqlite`.

Args:
    url: Address of the case page; also the duplicate-check key.
    mudda_type: Case category; stored with the record and used to derive
        the saved HTML file name.
    sal: Year; forwarded to `return_soup` and used to derive the saved
        HTML file name.
    use_saved: Forwarded to `return_soup`.

Returns:
    True when the URL was already stored or was scraped and saved; False
    when the page could not be obtained or any exception was raised (the
    exception is printed, not propagated).
"""
try:
# Skip the page when its URL is already stored.
cursor = self.conn.cursor()
cursor.execute('SELECT लिङ्क FROM cases WHERE लिङ्क = ?', (url,))
if cursor.fetchone():
print(f"URL {url} already exists in database, skipping...")
return True
# Get soup using saved HTML or web
soup = self.return_soup(url, mudda_type, sal, use_saved)
if not soup:
print(f"Failed to get content for {url}")
return False
# Extract basic information
# The decision number is the third whitespace-separated token of the title.
title_tag = soup.find("h1", class_="post-title")
decision_title = title_tag.get_text(strip=True).split()[2] if title_tag and len(title_tag.get_text(strip=True).split()) > 2 else "N/A"
bhaag = self.get_edition_field(soup, "भाग")
saal = self.get_edition_field(soup, "साल")
mahina = self.get_edition_field(soup, "महिना")
anka = self.get_edition_field(soup, "अंक")
# Extract decision date
# Takes the first token that follows "फैसला मिति :" in the post-meta text.
post_meta = soup.find("div", class_="post-meta")
decision_date = "N/A"
if post_meta and "फैसला मिति" in post_meta.text:
try:
decision_date = post_meta.text.strip().split("फैसला मिति :")[-1].split("\n")[0].strip().split()[0]
except IndexError:
decision_date = "N/A"
# Extract detailed information
# NOTE(review): the id literal below ends with a space — presumably it
# mirrors the site's markup; confirm before "fixing" it.
div_tag = soup.find("div", id="faisala_detail ")
details = {}
if div_tag:
tags = div_tag.find_all(['h1', 'p'])
n = len(tags)
# ind is the scan position in tags; temp_ind_32 remembers the last
# position a section search ended on, and ind is rewound to it whenever
# a search runs off the end of the list.
ind = 0
temp_ind_32 = ind
# Keyword lists (spelling variants included) used by the scans below:
#   KEYWORDS_2  - markers that start a "प्रकरण" entry
#   KEYWORDS_3  - labels for the "निवेदक" field
#   KEYWORDS_4  - labels for the "विपक्षी" field
#   KEYWORDS_5  - labels for the "विषय" field
#   KEYWORDS_6  - court / bench words for the "इजलास" field
#   KEYWORDS_7  - words that start the "आदेश मिति" line
#   KEYWORDS_8  - words that mark a judge line
#   KEYWORDS_9  - not referenced anywhere else in this method
#   KEYWORDS_10 - two-letter codes that mark the "केस_नम्बर" line
KEYWORDS_2 = ["प्रकरण नं.", "(प्रकरण नं", "(प्रकारण नं.", "९प्रकरण नं।", "(प्रकरण", "(प्र नं.","( प्र. नं","(प्र.नं", "(प्र. नं", "( प्रकरण नं.", "( प्रकरणन", "( प्र.नं.", "( प्र . नं .", "( प ्र . नं .", "(प्ररकण नं.", "(प्रकराण नं."]
KEYWORDS_3 = ["निवेदक", "वादी", "पुनरावेदक", "निबेदक", "पुनरावदेक", "निवेदिका", "निवेदीका", "निवदेक", "न ि वेदक ः", "नि वेदक ः", "पुनरावेदन", "पुनरवेदिका", "पुनरावेदिका", "पुनरावेदीका", "बादि", "पुनराबेदक", "प्रतिबादी", "पुनरावेक", "अपीलाट", "निवेदनक", "उजुरवाला", "अपिलबाट", "अपिलाट"]
KEYWORDS_4 = ["विपक्षी", "प्रतिवादी", "प्रत्यर्थी", "बिपक्षी", "विपक्षी ः", "पिपक्षी", "प्रत्यार्थी", "विपक्ष", "रेस्पोण्डेण्ट", "रेस्पोन्डेन्ट", "प्रत्यथी"]
KEYWORDS_5 = ["विषय", "मुद्दा", "बिषय", "मूद्दा", "मुद्द", "मद्दा", "विपक्ष", "मुद्धा", "मुद् दा"]
KEYWORDS_6 = ["अदालत", "इजलास", "इजालास", "इजलाश", "बेञ्च"]
KEYWORDS_7 = ["आदेश", "फैसला", "फैसलमा", "निर्णय", "फै सला", "मुद्दा"]
KEYWORDS_8 = ["न्यायाधीश", "माननीय", "न्यायधीश", "न्यायाधीस", "न्ययाधीश", "न्यायाधिश", "न्यायाधी", "न्यानायधीश", "नयायाधीश", "न्यायाधधिश", "नयाधश"]
KEYWORDS_9 = [ "विरूद्ध", "बिरूद्ध", "विरुद्ध", "बिरुद्ध"]
KEYWORDS_10 = ["AP", "FN", "RE", "RI", "LE", "RV", "NF", "CI", "CR", "RC", "SA", "MS", "ND", "RB", "CF", "DF", "RF", "WO", "WH", "WS", "WF", "WC", "CC", "EC"]
# Extract court information
# temp_ijlash carries the previously seen text, which is used as the
# court value when the keyword stands alone or a judge line comes first.
temp_ijlash = ""
while ind < n:
text = tags[ind].get_text(separator=' ', strip=True)
if text:
if any(kw == text for kw in KEYWORDS_6):
if "निर्णय नं." not in temp_ijlash:
details["इजलास"] = temp_ijlash
ind+=1
break
elif any(kw in text for kw in KEYWORDS_6):
details["इजलास"] = text
ind+=1
text_2 = tags[ind].get_text(separator=' ', strip=True)
if any(kw in text_2 for kw in KEYWORDS_8) == False:
details["इजलास"] = text +" "+ text_2
ind+=1
break
elif any(kw in text for kw in KEYWORDS_8):
if "निर्णय नं." not in temp_ijlash:
details["इजलास"] = temp_ijlash
break
temp_ijlash = text
ind+=1
if ind >= n:
ind = temp_ind_32
else:
temp_ind_32 = ind
# Extract judges
# Judge lines are gathered until the first line without a judge keyword;
# that line is then examined as a possible case number.
judges = []
while ind < n:
text = tags[ind].get_text(separator=' ', strip=True)
if text:
if any(kw in text for kw in KEYWORDS_8):
judges.append(text)
else:
details["न्यायाधीश"] = judges
if any(kw in text for kw in KEYWORDS_10):
details["केस_नम्बर"] = text
ind+=1
elif any(kw2 in text for kw2 in KEYWORDS_3) == False and any(kw2 in text for kw2 in KEYWORDS_5) == False:
details["केस_नम्बर"] = text
ind+=1
break
ind += 1
if ind >= n:
ind = temp_ind_32
else:
temp_ind_32 = ind
# Standard case structure
# Look ahead (without moving ind) to learn whether a subject line comes
# before the first party label.
bisaya_before_niweduck = False
temp_ind_64 = ind
while temp_ind_64 < n:
text = tags[temp_ind_64].get_text(separator=' ', strip=True)
if any(kw in text for kw in KEYWORDS_3) or any(kw in text for kw in KEYWORDS_4):
break
if any(kw in text for kw in KEYWORDS_5):
bisaya_before_niweduck = True
break
temp_ind_64+=1
if bisaya_before_niweduck:
while ind < n:
text = tags[ind].get_text(separator=' ', strip=True)
if any(text.startswith(kw) for kw in KEYWORDS_5):
details["विषय"] = text
ind+=1
break
if any(kw in text for kw in KEYWORDS_3):
ind = temp_ind_32
break
ind+=1
else:
while ind < n:
text = tags[ind].get_text(separator=' ', strip=True)
if any(text.startswith(kw) for kw in KEYWORDS_7) and ("मिति" in text or "मिती" in text):
details["आदेश मिति"] = text
ind+=1
break
if any(kw in text for kw in KEYWORDS_3):
ind = temp_ind_32
break
ind+=1
if ind >= n:
ind = temp_ind_32
else:
temp_ind_32 = ind
# Petitioner: when the label stands alone, the value is read from the next tag.
while ind < n:
text = tags[ind].get_text(separator=' ', strip=True)
if any(kw in text for kw in KEYWORDS_3):
if any(kw2 == text for kw2 in KEYWORDS_3):
ind += 1
text = tags[ind].get_text(separator=' ', strip=True)
details["निवेदक"] = text
ind+=1
break
ind+=1
if ind >= n:
ind = temp_ind_32
else:
temp_ind_32 = ind
# Opponent: same pattern as the petitioner scan, using KEYWORDS_4.
while ind < n:
text = tags[ind].get_text(separator=' ', strip=True)
if any(kw in text for kw in KEYWORDS_4):
if any(kw2 == text for kw2 in KEYWORDS_4):
ind += 1
text = tags[ind].get_text(separator=' ', strip=True)
details["विपक्षी"] = text
ind+=1
break
ind+=1
if ind >= n:
ind = temp_ind_32
else:
temp_ind_32 = ind
# Whichever of subject / order date was not read before the parties is
# looked for after them.
if bisaya_before_niweduck==False:
while ind < n:
text = tags[ind].get_text(separator=' ', strip=True)
if any(text.startswith(kw) for kw in KEYWORDS_5):
details["विषय"] = text
ind+=1
break
if any(kw in text for kw in KEYWORDS_2):
ind = temp_ind_32
break
ind+=1
else:
while ind < n:
text = tags[ind].get_text(separator=' ', strip=True)
if any(text.startswith(kw) for kw in KEYWORDS_7) and ("मिति" in text or "मिती" in text):
details["आदेश मिति"] = text
ind+=1
break
if any(kw in text for kw in KEYWORDS_2):
ind = temp_ind_32
break
ind+=1
if ind >= n:
ind = temp_ind_32
else:
temp_ind_32 = ind
# Extract prakarans and tahar
# prev accumulates ordinary text so it can be stored together with the
# KEYWORDS_2 marker line that follows it. Once a bare "फैसला" / "आदेश" /
# "फैसलाः" heading is seen, every later text is also collected in tahar.
prakarans = []
prev = ""
tahar = []
temp_flag_tahar = False
for tag in tags[ind:]:
text = tag.get_text(separator=' ', strip=True)
if text:
#if "§" in text or any(kw in text for kw in KEYWORDS_2):
#prakarans.append(text)
if "§" in text or any(text.startswith(kw) for kw in KEYWORDS_2) or "फैसला"==text or "आदेश"==text or "फैसलाः"==text:
if any(text.startswith(kw) for kw in KEYWORDS_2):
if prev: # IMPROVEMENT 23: Only append if prev has content
prakarans.append(prev)
prakarans.append(text)
prev = ""
if "§" in text:
prakarans.append(text)
if "फैसला"==text or "आदेश"==text or "फैसलाः"==text:
if not prakarans:
prakarans.append(prev)
else:
prev = prev + " " + text if prev else text
if text in ["फैसला", "आदेश", "फैसलाः"] or temp_flag_tahar:
temp_flag_tahar = True
tahar.append(text)
# Process list items
# <ul>/<ol> siblings that directly follow the tag get the same treatment.
next_sib = tag.find_next_sibling()
while next_sib and next_sib.name in ['ul', 'ol']:
for li in next_sib.find_all('li'):
li_text = li.get_text(separator=' ', strip=True)
if li_text:
if any(li_text.startswith(kw) for kw in KEYWORDS_2):
if prev: # IMPROVEMENT 24: Only append if prev has content
prakarans.append(prev)
prakarans.append(li_text)
prev = ""
else:
prev = prev + " " + li_text if prev else li_text
if li_text in ["फैसला", "आदेश", "फैसलाः"] or temp_flag_tahar:
temp_flag_tahar = True
tahar.append(li_text)
next_sib = next_sib.find_next_sibling()
details["प्रकरण"] = prakarans
details["ठहर"] = tahar
# Get HTML file path
html_file_path = ""
if mudda_type and sal:
filename = self.generate_html_filename(url, mudda_type, sal)
html_file_path = os.path.join(self.html_folder, filename)
# Combine all data
# List values are stored as JSON text; the decision date is stored wrapped
# in literal single quotes.
data = {
"लिङ्क": url,
"निर्णय नं.": decision_title,
"भाग": bhaag or "N/A",
"मुद्दाको किसिम": mudda_type,
"साल": saal or "N/A",
"महिना": mahina or "N/A",
"अंक": anka or "N/A",
"फैसला मिति": f"'{decision_date}'",
"अदालत / इजलास": details.get("इजलास", "N/A"),
"न्यायाधीश": json.dumps(details.get("न्यायाधीश", []), ensure_ascii=False),
"आदेश मिति": details.get("आदेश मिति", "N/A"),
"केस_नम्बर": json.dumps(details.get("केस_नम्बर", []), ensure_ascii=False) if isinstance(details.get("केस_नम्बर"), list) else details.get("केस_नम्बर", "N/A"),
"विषय": details.get("विषय", "N/A"),
"निवेदक": json.dumps(details.get("निवेदक", []), ensure_ascii=False) if isinstance(details.get("निवेदक"), list) else details.get("निवेदक", "N/A"),
"विपक्षी": json.dumps(details.get("विपक्षी", []), ensure_ascii=False) if isinstance(details.get("विपक्षी"), list) else details.get("विपक्षी", "N/A"),
"प्रकरण": json.dumps(details.get("प्रकरण", []), ensure_ascii=False),
"ठहर": json.dumps(details.get("ठहर", []), ensure_ascii=False),
"html_file_path": html_file_path
}
# Save to SQLite
self.save_to_sqlite(data)
print(f"{url} - Successfully Scraped and Entered")
return True
except Exception as e:
# Any failure while parsing or saving is reported and turned into False.
print(f"Error scraping {url}: {e}")
return False
def scrape_case_details_2045_to_2050(self, url, mudda_type, sal = None, use_saved=True): # CHANGE 4: Remove output_db parameter
"""Scrape one case page with the heuristics used for years 2045-2050.

URLs already present in the `cases` table are skipped. Otherwise the page
is obtained through `return_soup`, the <h1>/<p> tags of the detail <div>
are scanned with keyword lists, and the assembled record is handed to
`save_to_sqlite`.

Args:
    url: Address of the case page; also the duplicate-check key.
    mudda_type: Case category; stored with the record and used to derive
        the saved HTML file name.
    sal: Year; forwarded to `return_soup` and used to derive the saved
        HTML file name.
    use_saved: Forwarded to `return_soup`.

Returns:
    True when the URL was already stored or was scraped and saved; False
    when the page could not be obtained or any exception was raised (the
    exception is printed, not propagated).
"""
try:
# IMPROVEMENT 16: Check if URL already exists in database
cursor = self.conn.cursor()
cursor.execute('SELECT लिङ्क FROM cases WHERE लिङ्क = ?', (url,))
if cursor.fetchone():
print(f"URL {url} already exists in database, skipping...")
return True
# Get soup using saved HTML or web
soup = self.return_soup(url, mudda_type, sal, use_saved)
if not soup:
print(f"Failed to get content for {url}")
return False
# Extract basic information
# The decision number is the third whitespace-separated token of the title.
title_tag = soup.find("h1", class_="post-title")
decision_title = title_tag.get_text(strip=True).split()[2] if title_tag and len(title_tag.get_text(strip=True).split()) > 2 else "N/A" # IMPROVEMENT 19: Bounds checking
bhaag = self.get_edition_field(soup, "भाग")
saal = self.get_edition_field(soup, "साल")
mahina = self.get_edition_field(soup, "महिना")
anka = self.get_edition_field(soup, "अंक")
# Extract decision date
# Takes the first token that follows "फैसला मिति :" in the post-meta text.
post_meta = soup.find("div", class_="post-meta")
decision_date = "N/A"
if post_meta and "फैसला मिति" in post_meta.text:
try: # IMPROVEMENT 20: Better error handling for date extraction
decision_date = post_meta.text.strip().split("फैसला मिति :")[-1].split("\n")[0].strip().split()[0]
except IndexError:
decision_date = "N/A"
# Extract detailed information
# NOTE(review): the id literal below ends with a space — presumably it
# mirrors the site's markup; confirm before "fixing" it.
div_tag = soup.find("div", id="faisala_detail ")
details = {}
if div_tag:
tags = div_tag.find_all(['h1', 'p'])
n = len(tags)
# ind is the scan position in tags; temp_ind_32 remembers the last
# position a section search ended on, and ind is rewound to it whenever
# a search runs off the end of the list.
ind = 0
temp_ind_32 = ind
# Keyword lists (spelling variants included) used by the scans below:
#   KEYWORDS_2 - markers that start a "प्रकरण" entry
#   KEYWORDS_3 - labels for the "निवेदक" field
#   KEYWORDS_4 - labels for the "विपक्षी" field
#   KEYWORDS_5 - labels for the "विषय" field
#   KEYWORDS_6 - court / bench words for the "इजलास" field
#   KEYWORDS_7 - words that start the "आदेश मिति" line
#   KEYWORDS_8, KEYWORDS_9, KEYWORDS_10 - not referenced anywhere else in
#   this method (judge lines are matched with two literal words instead)
KEYWORDS_2 = ["प्रकरण नं.", "(प्रकरण नं", "(प्रकारण नं.", "९प्रकरण नं।", "(प्रकरण", "(प्र नं.","( प्र. नं","(प्र.नं", "(प्र. नं", "( प्रकरण नं.", "( प्रकरणन", "( प्र.नं.", "( प्र . नं .", "( प ्र . नं .", "(प्ररकण नं.", "(प्रकराण नं."]
KEYWORDS_3 = ["निवेदक", "वादी", "पुनरावेदक", "निबेदक", "पुनरावदेक", "निवेदिका", "निवेदीका", "निवदेक", "न ि वेदक ः", "नि वेदक ः", "पुनरावेदन", "पुनरवेदिका", "पुनरावेदिका", "पुनरावेदीका", "बादि", "पुनराबेदक", "प्रतिबादी", "पुनरावेक", "अपीलाट", "निवेदनक", "उजुरवाला", "अपिलबाट", "अपिलाट"]
KEYWORDS_4 = ["विपक्षी", "प्रतिवादी", "प्रत्यर्थी", "बिपक्षी", "विपक्षी ः", "पिपक्षी", "प्रत्यार्थी", "विपक्ष", "रेस्पोण्डेण्ट", "रेस्पोन्डेन्ट", "प्रत्यथी"]
KEYWORDS_5 = ["विषय", "मुद्दा", "बिषय", "मूद्दा", "मुद्द", "मद्दा", "विपक्ष", "मुद्धा", "मुद् दा"]
KEYWORDS_6 = ["अदालत", "इजलास", "इजालास", "इजलाश", "बेञ्च"]
KEYWORDS_7 = ["आदेश", "फैसला", "फैसलमा", "निर्णय", "फै सला", "मुद्दा"]
KEYWORDS_8 = ["न्यायाधीश", "माननीय", "न्यायधीश", "न्यायाधीस", "न्ययाधीश", "न्यायाधिश", "न्यायाधी", "न्यानायधीश", "नयायाधीश", "न्यायाधधिश", "नयाधश"]
KEYWORDS_9 = [ "विरूद्ध", "बिरूद्ध", "विरुद्ध", "बिरुद्ध"]
KEYWORDS_10 = ["AP", "FN", "RE", "RI", "LE", "RV", "NF", "CI", "CR", "RC", "SA", "MS", "ND", "RB", "CF", "DF", "RF", "WO", "WH", "WS", "WF", "WC", "CC", "EC"]
# Extract court information
# temp_ijlash carries the previously seen text, which is used as the
# court value when the keyword stands alone or a judge line comes first.
temp_ijlash = ""
while(ind < n):
#text = p_tags[ind].get_text(strip=True)
text = tags[ind].get_text(separator=' ', strip=True)
if text:
if any(kw == text for kw in KEYWORDS_6):
details["इजलास"] = temp_ijlash
ind+=1
break
elif any(kw in text for kw in KEYWORDS_6):
details["इजलास"] = text
ind+=1
break
elif "न्यायाधीश" in text or "माननीय" in text:
details["इजलास"] = temp_ijlash
break
temp_ijlash = text
ind+=1
if ind >= n:
ind = temp_ind_32
else:
temp_ind_32 = ind
# Extract judges
# Judge lines are gathered until the order-date line is reached; any other
# non-empty text seen on the way is kept as the case number.
judges = []
while ind < n:
text = tags[ind].get_text(separator=' ', strip=True)
if text:
if "न्यायाधीश" in text or "माननीय" in text:
judges.append(text)
elif any(text.startswith(kw) for kw in KEYWORDS_7) and ("मिति" in text or "मिती" in text):
details["न्यायाधीश"] = judges
details["आदेश मिति"] = text
ind += 1
break
else:
details["केस_नम्बर"] = text
ind += 1
if ind >= n:
ind = temp_ind_32
else:
temp_ind_32 = ind
# Standard case structure
# Find out whether a subject line comes before the first party label.
bisaya_before_niweduck = False
details["विषय"] = ""
while ind < n:
text = tags[ind].get_text(separator=' ', strip=True)
if any(kw in text for kw in KEYWORDS_3) or any(kw in text for kw in KEYWORDS_4):
break
if any(kw in text for kw in KEYWORDS_5):
bisaya_before_niweduck = True
break
ind+=1
if ind >= n:
ind = temp_ind_32
else:
temp_ind_32 = ind
if bisaya_before_niweduck:
while ind < n:
#text = p_tags[ind].get_text(strip=True)
text = tags[ind].get_text(separator=' ', strip=True)
if any(kw in text for kw in KEYWORDS_5):
details["विषय"] = text
ind+=1
break
ind+=1
if ind >= n:
ind = temp_ind_32
else:
temp_ind_32 = ind
#temp_Ind = ind
# Petitioner: when the label stands alone, the value is read from the next tag.
while ind < n:
#text = p_tags[ind].get_text(strip=True)
text = tags[ind].get_text(separator=' ', strip=True)
if any(kw in text for kw in KEYWORDS_3):
if any(kw2 == text for kw2 in KEYWORDS_3):
ind += 1
text = tags[ind].get_text(separator=' ', strip=True)
details["निवेदक"] = text
ind+=1
break
ind+=1
if ind >= n:
ind = temp_ind_32
else:
temp_ind_32 = ind
# Opponent: same pattern as the petitioner scan, using KEYWORDS_4.
while ind < n:
#text = p_tags[ind].get_text(strip=True)
text = tags[ind].get_text(separator=' ', strip=True)
if any(kw in text for kw in KEYWORDS_4):
if any(kw2 == text for kw2 in KEYWORDS_4):
ind += 1
text = tags[ind].get_text(separator=' ', strip=True)
details["विपक्षी"] = text
ind+=1
break
ind+=1
if ind >= n:
ind = temp_ind_32
else:
temp_ind_32 = ind
# When the subject was not found before the parties, look for it after them.
if bisaya_before_niweduck==False:
while ind < n:
#text = p_tags[ind].get_text(strip=True)
text = tags[ind].get_text(separator=' ', strip=True)
if any(kw in text for kw in KEYWORDS_5):
details["विषय"] = text
ind+=1
break
ind+=1
if ind >= n:
ind = temp_ind_32
else:
temp_ind_32 = ind
# Extract prakarans and tahar
# prev accumulates ordinary text so it can be stored together with the
# KEYWORDS_2 marker line that follows it. Once a bare "फैसला" / "आदेश" /
# "फैसलाः" heading is seen, every later text is also collected in tahar.
prakarans = []
prev = ""
tahar = []
temp_flag_tahar = False
for tag in tags[ind:]:
text = tag.get_text(separator=' ', strip=True)
if text:
#if "§" in text or any(kw in text for kw in KEYWORDS_2):
#prakarans.append(text)
if "§" in text or any(text.startswith(kw) for kw in KEYWORDS_2) or "फैसला"==text or "आदेश"==text or "फैसलाः"==text:
if any(text.startswith(kw) for kw in KEYWORDS_2):
if prev: # IMPROVEMENT 23: Only append if prev has content
prakarans.append(prev)
prakarans.append(text)
prev = ""
if "§" in text:
prakarans.append(text)
if "फैसला"==text or "आदेश"==text or "फैसलाः"==text:
if not prakarans:
prakarans.append(prev)
else:
prev = prev + " " + text if prev else text
if text in ["फैसला", "आदेश", "फैसलाः"] or temp_flag_tahar:
temp_flag_tahar = True
tahar.append(text)
# Process list items
# <ul>/<ol> siblings that directly follow the tag get the same treatment.
next_sib = tag.find_next_sibling()
while next_sib and next_sib.name in ['ul', 'ol']:
for li in next_sib.find_all('li'):
li_text = li.get_text(separator=' ', strip=True)
if li_text:
if any(li_text.startswith(kw) for kw in KEYWORDS_2):
if prev: # IMPROVEMENT 24: Only append if prev has content
prakarans.append(prev)
prakarans.append(li_text)
prev = ""
else:
prev = prev + " " + li_text if prev else li_text
if li_text in ["फैसला", "आदेश", "फैसलाः"] or temp_flag_tahar:
temp_flag_tahar = True
tahar.append(li_text)
next_sib = next_sib.find_next_sibling()
details["प्रकरण"] = prakarans
details["ठहर"] = tahar
# Get HTML file path
html_file_path = ""
if mudda_type and sal:
filename = self.generate_html_filename(url, mudda_type, sal)
html_file_path = os.path.join(self.html_folder, filename)
# Combine all data, handling lists and strings appropriately
# List values are stored as JSON text; the decision date is stored wrapped
# in literal single quotes.
data = {
"लिङ्क": url,
"निर्णय नं.": decision_title,
"भाग": bhaag or "N/A", # IMPROVEMENT 25: Handle None values
"मुद्दाको किसिम": mudda_type,
"साल": saal or "N/A",
"महिना": mahina or "N/A",
"अंक": anka or "N/A",
"फैसला मिति": f"'{decision_date}'",
"अदालत / इजलास": details.get("इजलास", "N/A"),
"न्यायाधीश": json.dumps(details.get("न्यायाधीश", []), ensure_ascii=False),
"आदेश मिति": details.get("आदेश मिति", "N/A"),
"केस_नम्बर": json.dumps(details.get("केस_नम्बर", []), ensure_ascii=False) if isinstance(details.get("केस_नम्बर"), list) else details.get("केस_नम्बर", "N/A"),
"विषय": details.get("विषय", "N/A"),
"निवेदक": json.dumps(details.get("निवेदक", []), ensure_ascii=False) if isinstance(details.get("निवेदक"), list) else details.get("निवेदक", "N/A"),
"विपक्षी": json.dumps(details.get("विपक्षी", []), ensure_ascii=False) if isinstance(details.get("विपक्षी"), list) else details.get("विपक्षी", "N/A"),
"प्रकरण": json.dumps(details.get("प्रकरण", []), ensure_ascii=False),
"ठहर": json.dumps(details.get("ठहर", []), ensure_ascii=False),
"html_file_path": html_file_path
}
# Save to SQLite
self.save_to_sqlite(data)
print(f"{url} - Successfully Scraped and Entered")
return True
except Exception as e:
# Any failure while parsing or saving is reported and turned into False.
print(f"Error scraping {url}: {e}")
return False
def scrape_case_details_2051_to_2061(self, url, mudda_type, sal = None, use_saved=True): # CHANGE 4: Remove output_db parameter
"""Scrape details from a single case URL"""
try:
# IMPROVEMENT 16: Check if URL already exists in database
cursor = self.conn.cursor()
cursor.execute('SELECT लिङ्क FROM cases WHERE लिङ्क = ?', (url,))
if cursor.fetchone():
print(f"URL {url} already exists in database, skipping...")
return True
# Get soup using saved HTML or web
soup = self.return_soup(url, mudda_type, sal, use_saved)
if not soup:
print(f"Failed to get content for {url}")
return False
# Extract basic information
title_tag = soup.find("h1", class_="post-title")
decision_title = title_tag.get_text(strip=True).split()[2] if title_tag and len(title_tag.get_text(strip=True).split()) > 2 else "N/A" # IMPROVEMENT 19: Bounds checking
bhaag = self.get_edition_field(soup, "भाग")
saal = self.get_edition_field(soup, "साल")
mahina = self.get_edition_field(soup, "महिना")
anka = self.get_edition_field(soup, "अंक")
# Extract decision date
post_meta = soup.find("div", class_="post-meta")
decision_date = "N/A"
if post_meta and "फैसला मिति" in post_meta.text:
try: # IMPROVEMENT 20: Better error handling for date extraction
decision_date = post_meta.text.strip().split("फैसला मिति :")[-1].split("\n")[0].strip().split()[0]
except IndexError:
decision_date = "N/A"
# Extract detailed information
div_tag = soup.find("div", id="faisala_detail ")
details = {}
if div_tag:
tags = div_tag.find_all(['h1', 'p'])
n = len(tags)
ind = 0
temp_ind_32 = ind
KEYWORDS_2 = ["प्रकरण नं.", "(प्रकरण नं", "(प्रकारण नं.", "९प्रकरण नं।", "(प्रकरण", "(प्र नं.","( प्र. नं","(प्र.नं", "(प्र. नं", "( प्रकरण नं.", "( प्रकरणन", "( प्र.नं.", "( प्र . नं .", "( प ्र . नं .", "(प्ररकण नं.", "(प्रकराण नं."]
KEYWORDS_3 = ["निवेदक", "वादी", "पुनरावेदक", "निबेदक", "पुनरावदेक", "निवेदिका", "निवेदीका", "निवदेक", "न ि वेदक ः", "नि वेदक ः", "पुनरावेदन", "पुनरवेदिका", "पुनरावेदिका", "पुनरावेदीका", "बादि", "पुनराबेदक", "प्रतिबादी", "पुनरावेक", "अपीलाट", "निवेदनक", "उजुरवाला", "अपिलबाट", "अपिलाट"]
KEYWORDS_4 = ["विपक्षी", "प्रतिवादी", "प्रत्यर्थी", "बिपक्षी", "विपक्षी ः", "पिपक्षी", "प्रत्यार्थी", "विपक्ष", "रेस्पोण्डेण्ट", "रेस्पोन्डेन्ट", "प्रत्यथी"]
KEYWORDS_5 = ["विषय", "मुद्दा", "बिषय", "मूद्दा", "मुद्द", "मद्दा", "विपक्ष", "मुद्धा", "मुद् दा"]
KEYWORDS_6 = ["अदालत", "इजलास", "इजालास", "इजलाश", "बेञ्च"]
KEYWORDS_7 = ["आदेश", "फैसला", "फैसलमा", "निर्णय", "फै सला", "मुद्दा"]
KEYWORDS_8 = ["न्यायाधीश", "माननीय", "न्यायधीश", "न्यायाधीस", "न्ययाधीश", "न्यायाधिश", "न्यायाधी", "न्यानायधीश", "नयायाधीश", "न्यायाधधिश", "नयाधश"]
KEYWORDS_9 = [ "विरूद्ध", "बिरूद्ध", "विरुद्ध", "बिरुद्ध"]
KEYWORDS_10 = ["AP", "FN", "RE", "RI", "LE", "RV", "NF", "CI", "CR", "RC", "SA", "MS", "ND", "RB", "CF", "DF", "RF", "WO", "WH", "WS", "WF", "WC", "CC", "EC"]
# Extract court information
while ind < n:
text = tags[ind].get_text(separator=' ', strip=True)
if text and ("इजलास" in text or "इजालास" in text):
details["इजलास"] = text
ind += 1
break
ind += 1
if ind >= n:
ind = temp_ind_32
else:
temp_ind_32 = ind
# Extract judges
judges = []
while ind < n:
text = tags[ind].get_text(separator=' ', strip=True)
if text:
if "न्यायाधीश" in text or "माननीय" in text:
judges.append(text)
elif any(text.startswith(kw) for kw in KEYWORDS_7) and ("मिति" in text or "मिती" in text):
details["न्यायाधीश"] = judges
details["आदेश मिति"] = text
ind += 1
break
else:
details["केस_नम्बर"] = text
ind += 1
if ind >= n:
ind = temp_ind_32
else:
temp_ind_32 = ind
# Standard case structure
while ind < n:
text = tags[ind].get_text(separator=' ', strip=True)
if "विषय" in text or "मुद्दा" in text or "बिषय" in text or "मूद्दाः" in text:
details["विषय"] = text
ind += 1
break
ind += 1
if ind >= n:
ind = temp_ind_32
while ind < n:
text = tags[ind].get_text(separator=' ', strip=True)
if any(kw in text for kw in KEYWORDS_3):
if any(kw2 == text for kw2 in KEYWORDS_3):
ind += 1
if ind < n: # IMPROVEMENT 21: Bounds checking
text = tags[ind].get_text(separator=' ', strip=True)
details["निवेदक"] = text
ind += 1
break
ind += 1
if ind >= n:
ind = temp_ind_32
else:
temp_ind_32 = ind
while ind < n:
text = tags[ind].get_text(separator=' ', strip=True)
if any(kw in text for kw in KEYWORDS_4):
if any(kw2 == text for kw2 in KEYWORDS_4):
ind += 1
if ind < n: # IMPROVEMENT 22: Bounds checking
text = tags[ind].get_text(separator=' ', strip=True)
details["विपक्षी"] = text
ind += 1
break
ind += 1
if ind >= n:
ind = temp_ind_32
else:
temp_ind_32 = ind
# Extract prakarans and tahar
prakarans = []
prev = ""
tahar = []
temp_flag_tahar = False
for tag in tags[ind:]:
text = tag.get_text(separator=' ', strip=True)
if text:
#if "§" in text or any(kw in text for kw in KEYWORDS_2):
#prakarans.append(text)
if "§" in text or any(text.startswith(kw) for kw in KEYWORDS_2) or "फैसला"==text or "आदेश"==text or "फैसलाः"==text:
if any(text.startswith(kw) for kw in KEYWORDS_2):
if prev: # IMPROVEMENT 23: Only append if prev has content
prakarans.append(prev)
prakarans.append(text)
prev = ""
if "§" in text:
prakarans.append(text)
if "फैसला"==text or "आदेश"==text or "फैसलाः"==text:
if not prakarans:
prakarans.append(prev)
else:
prev = prev + " " + text if prev else text
if text in ["फैसला", "आदेश", "फैसलाः"] or temp_flag_tahar:
temp_flag_tahar = True
tahar.append(text)
# Process list items
next_sib = tag.find_next_sibling()
while next_sib and next_sib.name in ['ul', 'ol']:
for li in next_sib.find_all('li'):
li_text = li.get_text(separator=' ', strip=True)
if li_text:
if any(li_text.startswith(kw) for kw in KEYWORDS_2):
if prev: # IMPROVEMENT 24: Only append if prev has content
prakarans.append(prev)
prakarans.append(li_text)
prev = ""
else:
prev = prev + " " + li_text if prev else li_text
if li_text in ["फैसला", "आदेश", "फैसलाः"] or temp_flag_tahar:
temp_flag_tahar = True
tahar.append(li_text)
next_sib = next_sib.find_next_sibling()
details["प्रकरण"] = prakarans
details["ठहर"] = tahar
# Get HTML file path
html_file_path = ""
if mudda_type and sal:
filename = self.generate_html_filename(url, mudda_type, sal)
html_file_path = os.path.join(self.html_folder, filename)
# Combine all data, handling lists and strings appropriately
data = {
"लिङ्क": url,
"निर्णय नं.": decision_title,
"भाग": bhaag or "N/A", # IMPROVEMENT 25: Handle None values
"मुद्दाको किसिम": mudda_type,
"साल": saal or "N/A",
"महिना": mahina or "N/A",
"अंक": anka or "N/A",
"फैसला मिति": f"'{decision_date}'",
"अदालत / इजलास": details.get("इजलास", "N/A"),
"न्यायाधीश": json.dumps(details.get("न्यायाधीश", []), ensure_ascii=False),
"आदेश मिति": details.get("आदेश मिति", "N/A"),
"केस_नम्बर": json.dumps(details.get("केस_नम्बर", []), ensure_ascii=False) if isinstance(details.get("केस_नम्बर"), list) else details.get("केस_नम्बर", "N/A"),
"विषय": details.get("विषय", "N/A"),
"निवेदक": json.dumps(details.get("निवेदक", []), ensure_ascii=False) if isinstance(details.get("निवेदक"), list) else details.get("निवेदक", "N/A"),
"विपक्षी": json.dumps(details.get("विपक्षी", []), ensure_ascii=False) if isinstance(details.get("विपक्षी"), list) else details.get("विपक्षी", "N/A"),
"प्रकरण": json.dumps(details.get("प्रकरण", []), ensure_ascii=False),
"ठहर": json.dumps(details.get("ठहर", []), ensure_ascii=False),
"html_file_path": html_file_path
}
# Save to SQLite
self.save_to_sqlite(data)
print(f"{url} - Successfully Scraped and Entered")
return True
except Exception as e:
print(f"Error scraping {url}: {e}")
return False
def scrape_case_details_2062_to_2072(self, url, mudda_type, sal = None, use_saved=True): # CHANGE 4: Remove output_db parameter
"""Scrape one case page with the "2062 to 2072" parsing rules and save it.

The method skips URLs that are already in the ``cases`` table, loads the
page through ``self.return_soup``, reads the title, the edition fields and
the decision date, and then walks the ``h1``/``p`` tags of the
``faisala_detail `` div in document order to pick out the court, judges,
order date, case number, subject, parties, the "prakaran" passages and
the verdict text. The assembled row is written with ``self.save_to_sqlite``.

The year range in the name is presumably Nepali calendar years; nothing
in this method checks the year itself.

Args:
url: Case page URL; also the key of the duplicate check.
mudda_type: Case category; stored with the row and passed to
``self.return_soup`` and ``self.generate_html_filename``.
sal: Year value passed to the same helpers. An HTML file path is only
recorded when both ``mudda_type`` and ``sal`` are truthy.
use_saved: Forwarded unchanged to ``self.return_soup``.

Returns:
True when the URL already existed or the row was saved; False when no
soup was returned or any exception was raised while scraping (the
exception is printed, not re-raised).
"""
try:
# IMPROVEMENT 16: Check if URL already exists in database
cursor = self.conn.cursor()
cursor.execute('SELECT लिङ्क FROM cases WHERE लिङ्क = ?', (url,))
if cursor.fetchone():
print(f"URL {url} already exists in database, skipping...")
return True
# Get soup using saved HTML or web
soup = self.return_soup(url, mudda_type, sal, use_saved)
if not soup:
print(f"Failed to get content for {url}")
return False
# Extract basic information
title_tag = soup.find("h1", class_="post-title")
# Third whitespace-separated token of the title, or "N/A" when the title is missing or has fewer than three tokens.
decision_title = title_tag.get_text(strip=True).split()[2] if title_tag and len(title_tag.get_text(strip=True).split()) > 2 else "N/A" # IMPROVEMENT 19: Bounds checking
bhaag = self.get_edition_field(soup, "भाग")
saal = self.get_edition_field(soup, "साल")
mahina = self.get_edition_field(soup, "महिना")
anka = self.get_edition_field(soup, "अंक")
# Extract decision date
post_meta = soup.find("div", class_="post-meta")
decision_date = "N/A"
if post_meta and "फैसला मिति" in post_meta.text:
try: # IMPROVEMENT 20: Better error handling for date extraction
# First whitespace-separated token following the "फैसला मिति :" label on its line.
decision_date = post_meta.text.strip().split("फैसला मिति :")[-1].split("\n")[0].strip().split()[0]
except IndexError:
decision_date = "N/A"
# Extract detailed information
# NOTE: the id literal ends with a space and is matched exactly as written.
div_tag = soup.find("div", id="faisala_detail ")
details = {}
if div_tag:
tags = div_tag.find_all(['h1', 'p'])
n = len(tags)
# `ind` is the scan cursor into `tags`; `temp_ind_32` holds the position the last successful scan stopped at.
ind = 0
temp_ind_32 = ind
# Keyword tables used to classify tag text. Several entries are spelling variants of the same label.
# KEYWORDS_2: prefixes that mark a "prakaran" line.
KEYWORDS_2 = ["प्रकरण नं.", "(प्रकरण नं", "(प्रकारण नं.", "९प्रकरण नं।", "(प्रकरण", "(प्र नं.","( प्र. नं","(प्र.नं", "(प्र. नं", "( प्रकरण नं.", "( प्रकरणन", "( प्र.नं.", "( प्र . नं .", "( प ्र . नं .", "(प्ररकण नं.", "(प्रकराण नं."]
# KEYWORDS_3: labels whose line is stored under "निवेदक".
KEYWORDS_3 = ["निवेदक", "वादी", "पुनरावेदक", "निबेदक", "पुनरावदेक", "निवेदिका", "निवेदीका", "निवदेक", "न ि वेदक ः", "नि वेदक ः", "पुनरावेदन", "पुनरवेदिका", "पुनरावेदिका", "पुनरावेदीका", "बादि", "पुनराबेदक", "प्रतिबादी", "पुनरावेक", "अपीलाट", "निवेदनक", "उजुरवाला", "अपिलबाट", "अपिलाट"]
# KEYWORDS_4: labels whose line is stored under "विपक्षी".
KEYWORDS_4 = ["विपक्षी", "प्रतिवादी", "प्रत्यर्थी", "बिपक्षी", "विपक्षी ः", "पिपक्षी", "प्रत्यार्थी", "विपक्ष", "रेस्पोण्डेण्ट", "रेस्पोन्डेन्ट", "प्रत्यथी"]
# KEYWORDS_5: prefixes of the line stored under "विषय".
KEYWORDS_5 = ["विषय", "मुद्दा", "बिषय", "मूद्दा", "मुद्द", "मद्दा", "विपक्ष", "मुद्धा", "मुद् दा"]
# KEYWORDS_6: words that identify the court / bench line.
KEYWORDS_6 = ["अदालत", "इजलास", "इजालास", "इजलाश", "बेञ्च"]
# KEYWORDS_7: prefixes of the order/decision date line (it must also contain "मिति" or "मिती").
KEYWORDS_7 = ["आदेश", "फैसला", "फैसलमा", "निर्णय", "फै सला", "मुद्दा"]
# KEYWORDS_8: words that identify a judge line.
KEYWORDS_8 = ["न्यायाधीश", "माननीय", "न्यायधीश", "न्यायाधीस", "न्ययाधीश", "न्यायाधिश", "न्यायाधी", "न्यानायधीश", "नयायाधीश", "न्यायाधधिश", "नयाधश"]
# KEYWORDS_9: lines equal to one of these are counted to detect pages with more than one party pair.
KEYWORDS_9 = [ "विरूद्ध", "बिरूद्ध", "विरुद्ध", "बिरुद्ध"]
# KEYWORDS_10: Latin two-letter codes; a line containing one of them is treated as a case number.
KEYWORDS_10 = ["AP", "FN", "RE", "RI", "LE", "RV", "NF", "CI", "CR", "RC", "SA", "MS", "ND", "RB", "CF", "DF", "RF", "WO", "WH", "WS", "WF", "WC", "CC", "EC"]
# Extract court information
# `temp_ijlash` carries tag text seen earlier in this scan; it is used as the court text when the
# current tag is only a bare court keyword or already a judge line.
temp_ijlash = ""
while(ind < n):
#text = p_tags[ind].get_text(strip=True)
text = tags[ind].get_text(separator=' ', strip=True)
if text:
if any(kw == text for kw in KEYWORDS_6):
if "निर्णय नं." not in temp_ijlash:
details["अदालत / इजलास"] = temp_ijlash
ind+=1
break
elif any(kw in text for kw in KEYWORDS_6):
details["अदालत / इजलास"] = text
ind+=1
# NOTE(review): tags[ind] is read right after ind+=1 with no `ind < n` check; if the court line is the
# last tag this raises IndexError, which the outer `except` reports as a failed scrape.
text_2 = tags[ind].get_text(separator=' ', strip=True)
# When the following tag is not a judge line, it is appended to the court text.
if any(kw in text_2 for kw in KEYWORDS_8) == False:
details["अदालत / इजलास"] = text +" "+ text_2
ind+=1
break
elif any(kw in text for kw in KEYWORDS_8):
if "निर्णय नं." not in temp_ijlash:
details["अदालत / इजलास"] = temp_ijlash
break
temp_ijlash = text
ind+=1
# Checkpoint: if the scan ran off the end of `tags`, rewind to the previous checkpoint; otherwise
# remember the new position. The same pattern follows most scans below.
if ind >= n:
ind = temp_ind_32
else:
temp_ind_32 = ind
# Extract judges
judges = []
# Flags recording the order in which the date, subject and case number appeared, so the later scans know
# which field is still missing.
faisla_miti_before_case_no = False
subject_before_case_no = False
while(ind < n):
#text = p_tags[ind].get_text(strip=True)
text = tags[ind].get_text(separator=' ', strip=True)
if text:
if any(kw in text for kw in KEYWORDS_8):
judges.append(text)
else:
# First non-empty line that is not a judge line: the judge list is stored, then the line is classified.
details["न्यायाधीश"] = judges
if any(text.startswith(kw) for kw in KEYWORDS_7) and ("मिति" in text or "मिती" in text):
details["आदेश मिति"] = text
ind+=1
faisla_miti_before_case_no = True
elif any(kw in text for kw in KEYWORDS_10):
details["केस_नम्बर"] = text
elif any(kw2 in text for kw2 in KEYWORDS_3) == False and any(kw2 in text for kw2 in KEYWORDS_5) == False:
if text!="फैसला":
details["केस_नम्बर"] = text
else:
ind+=1
# NOTE(review): same unguarded tags[ind] read as in the court scan above.
details["केस_नम्बर"] = tags[ind].get_text(separator=' ', strip=True)
ind+=1
break
ind+=1
if ind >= n:
ind = temp_ind_32
else:
temp_ind_32 = ind
# Standard case structure
if faisla_miti_before_case_no:
# The date came first: the next non-empty line is taken as the case number, or as the subject when it
# starts with a KEYWORDS_5 prefix.
while ind < n:
text = tags[ind].get_text(separator=' ', strip=True)
if text:
if any(kw in text for kw in KEYWORDS_10):
details["केस_नम्बर"] = text
elif any(text.startswith(kw) for kw in KEYWORDS_5):
subject_before_case_no = True
details["विषय"] = text
else:
details["केस_नम्बर"] = text
ind+=1
break
ind+=1
else:
# The date was not seen yet: look for it, but give up (rewind) on reaching a prakaran marker or a
# bare verdict heading.
while ind < n:
text = tags[ind].get_text(separator=' ', strip=True)
if text:
if any(text.startswith(kw) for kw in KEYWORDS_7) and ("मिति" in text or "मिती" in text):
details["आदेश मिति"] = text
ind+=1
break
if any(text.startswith(kw) for kw in KEYWORDS_2) or "फैसला"==text or "आदेश"==text or "फैसलाः"==text:
ind = temp_ind_32
break
ind+=1
if ind >= n:
ind = temp_ind_32
else:
temp_ind_32 = ind
if subject_before_case_no:
# The subject was already stored, so the next non-empty line is taken as the case number.
while ind < n:
text = tags[ind].get_text(separator=' ', strip=True)
if text:
details["केस_नम्बर"] = text
ind+=1
break
ind+=1
else:
# Look for the subject line; rewind if a petitioner label is reached first.
while ind < n:
text = tags[ind].get_text(separator=' ', strip=True)
if text:
if any(text.startswith(kw) for kw in KEYWORDS_5):
details["विषय"] = text
ind+=1
break
if any(kw in text for kw in KEYWORDS_3):
ind = temp_ind_32
break
ind+=1
if ind >= n:
ind = temp_ind_32
else:
temp_ind_32 = ind
# Look ahead (without moving `ind`) and count lines equal to a KEYWORDS_9 entry up to the first prakaran
# marker; more than one means several petitioner/opponent pairs are collected as lists.
temp_ind_64 = ind
count_how_many = 0
while temp_ind_64 < n:
text = tags[temp_ind_64].get_text(separator=' ', strip=True)
if text and any(kw == text for kw in KEYWORDS_9):
count_how_many += 1
if any(text.startswith(kw) for kw in KEYWORDS_2):
break
temp_ind_64+=1
if count_how_many > 1:
case_no = []
appellant = []
opposition = []
while count_how_many > 0:
while ind < n:
#text = p_tags[ind].get_text(strip=True)
text = tags[ind].get_text(separator=' ', strip=True)
if any(kw in text for kw in KEYWORDS_3):
# A tag holding only the label means the party text is in the following tag.
# NOTE(review): the tags[ind] read after ind += 1 is not bounds-checked.
if any(kw2 == text for kw2 in KEYWORDS_3):
ind += 1
text = tags[ind].get_text(separator=' ', strip=True)
appellant.append(text)
ind+=1
break
ind+=1
if ind >= n:
ind = temp_ind_32
else:
temp_ind_32 = ind
while ind < n:
#text = p_tags[ind].get_text(strip=True)
text = tags[ind].get_text(separator=' ', strip=True)
if any(kw in text for kw in KEYWORDS_4):
if any(kw2 == text for kw2 in KEYWORDS_4):
ind += 1
text = tags[ind].get_text(separator=' ', strip=True)
opposition.append(text)
ind+=1
break
ind+=1
if ind >= n:
ind = temp_ind_32
else:
temp_ind_32 = ind
count_how_many-=1
# Case numbers are gathered in a separate pass from the first tag: every line containing a KEYWORDS_10
# code, up to the first prakaran marker or a line equal to a KEYWORDS_7 entry.
temp_ind_128 = 0
while temp_ind_128 < n:
text = tags[temp_ind_128].get_text(separator=' ', strip=True)
if text:
if any(kw in text for kw in KEYWORDS_10):
case_no.append(text)
if any(text.startswith(kw) for kw in KEYWORDS_2) or any(kw == text for kw in KEYWORDS_7):
break
temp_ind_128 += 1
details["केस_नम्बर"] = case_no
details["निवेदक"] = appellant
details["विपक्षी"] = opposition
else:
# Single petitioner/opponent pair, stored as plain strings.
while ind < n:
text = tags[ind].get_text(separator=' ', strip=True)
if any(kw in text for kw in KEYWORDS_3):
if any(kw2 == text for kw2 in KEYWORDS_3):
ind += 1
text = tags[ind].get_text(separator=' ', strip=True)
details["निवेदक"] = text
ind+=1
break
ind+=1
if ind >= n:
ind = temp_ind_32
else:
temp_ind_32 = ind
while ind < n:
#text = p_tags[ind].get_text(strip=True)
text = tags[ind].get_text(separator=' ', strip=True)
if any(kw in text for kw in KEYWORDS_4):
if any(kw2 == text for kw2 in KEYWORDS_4):
ind += 1
text = tags[ind].get_text(separator=' ', strip=True)
details["विपक्षी"] = text
ind+=1
break
ind+=1
if ind >= n:
ind = temp_ind_32
else:
temp_ind_32 = ind
# Extract prakarans and tahar
# `prev` accumulates ordinary text until a prakaran marker is met; `tahar` collects every line from the first
# bare verdict heading ("फैसला", "आदेश" or "फैसलाः") onwards.
prakarans = []
prev = ""
tahar = []
temp_flag_tahar = False
for tag in tags[ind:]:
text = tag.get_text(separator=' ', strip=True)
if text:
#if "§" in text or any(kw in text for kw in KEYWORDS_2):
#prakarans.append(text)
if "§" in text or any(text.startswith(kw) for kw in KEYWORDS_2) or "फैसला"==text or "आदेश"==text or "फैसलाः"==text:
if any(text.startswith(kw) for kw in KEYWORDS_2):
if prev: # IMPROVEMENT 23: Only append if prev has content
prakarans.append(prev)
prakarans.append(text)
prev = ""
if "§" in text:
prakarans.append(text)
# On reaching the verdict heading with no prakaran collected yet, the accumulated text is kept as one.
if "फैसला"==text or "आदेश"==text or "फैसलाः"==text:
if not prakarans:
prakarans.append(prev)
else:
prev = prev + " " + text if prev else text
if text in ["फैसला", "आदेश", "फैसलाः"] or temp_flag_tahar:
temp_flag_tahar = True
tahar.append(text)
# Process list items
# Only `h1`/`p` tags were collected, so text inside directly following `ul`/`ol` siblings is handled here.
next_sib = tag.find_next_sibling()
while next_sib and next_sib.name in ['ul', 'ol']:
for li in next_sib.find_all('li'):
li_text = li.get_text(separator=' ', strip=True)
if li_text:
if any(li_text.startswith(kw) for kw in KEYWORDS_2):
if prev: # IMPROVEMENT 24: Only append if prev has content
prakarans.append(prev)
prakarans.append(li_text)
prev = ""
else:
prev = prev + " " + li_text if prev else li_text
if li_text in ["फैसला", "आदेश", "फैसलाः"] or temp_flag_tahar:
temp_flag_tahar = True
tahar.append(li_text)
next_sib = next_sib.find_next_sibling()
details["प्रकरण"] = prakarans
details["ठहर"] = tahar
# Get HTML file path
html_file_path = ""
if mudda_type and sal:
filename = self.generate_html_filename(url, mudda_type, sal)
html_file_path = os.path.join(self.html_folder, filename)
# Combine all data, handling lists and strings appropriately
# List-valued fields are stored as JSON text; missing fields become "N/A" (or "[]" for the JSON fields).
data = {
"लिङ्क": url,
"निर्णय नं.": decision_title,
"भाग": bhaag or "N/A", # IMPROVEMENT 25: Handle None values
"मुद्दाको किसिम": mudda_type,
"साल": saal or "N/A",
"महिना": mahina or "N/A",
"अंक": anka or "N/A",
# The date is wrapped in single quotes before being stored.
"फैसला मिति": f"'{decision_date}'",
"अदालत / इजलास": details.get("अदालत / इजलास", "N/A"),
"न्यायाधीश": json.dumps(details.get("न्यायाधीश", []), ensure_ascii=False),
"आदेश मिति": details.get("आदेश मिति", "N/A"),
"केस_नम्बर": json.dumps(details.get("केस_नम्बर", []), ensure_ascii=False) if isinstance(details.get("केस_नम्बर"), list) else details.get("केस_नम्बर", "N/A"),
"विषय": details.get("विषय", "N/A"),
"निवेदक": json.dumps(details.get("निवेदक", []), ensure_ascii=False) if isinstance(details.get("निवेदक"), list) else details.get("निवेदक", "N/A"),
"विपक्षी": json.dumps(details.get("विपक्षी", []), ensure_ascii=False) if isinstance(details.get("विपक्षी"), list) else details.get("विपक्षी", "N/A"),
"प्रकरण": json.dumps(details.get("प्रकरण", []), ensure_ascii=False),
"ठहर": json.dumps(details.get("ठहर", []), ensure_ascii=False),
"html_file_path": html_file_path
}
# Save to SQLite
self.save_to_sqlite(data)
print(f"{url} - Successfully Scraped and Entered")
return True
# Any failure while scraping (including the IndexError cases noted above) is reported and turned into False.
except Exception as e:
print(f"Error scraping {url}: {e}")
return False
def scrape_case_details_2073_to_2080_and_beyond(self, url, mudda_type, sal = None, use_saved=True):
"""Scrape details from a single case URL"""
try:
r = requests.get(url, timeout=15)
if r.status_code != 200:
print(f"Failed to retrieve {url}, Status code: {r.status_code}")
return False
# Get soup using saved HTML or web
soup = self.return_soup(url, mudda_type, sal, use_saved)
if not soup:
print(f"Failed to get content for {url}")
return False
# Extract basic information
title_tag = soup.find("h1", class_="post-title")
decision_title = title_tag.get_text(strip=True).split()[2] if title_tag else "N/A"
bhaag = self.get_edition_field(soup, "भाग")
saal = self.get_edition_field(soup, "साल")
mahina = self.get_edition_field(soup, "महिना")
anka = self.get_edition_field(soup, "अंक")
# Extract decision date
post_meta = soup.find("div", class_="post-meta")
decision_date = "N/A"
if post_meta and "फैसला मिति" in post_meta.text:
decision_date = post_meta.text.strip().split("फैसला मिति :")[-1].split("\n")[0].strip().split()[0]
# Extract detailed information
div_tag = soup.find("div", id="faisala_detail ")
details = {}
if div_tag:
tags = div_tag.find_all(['h1', 'p'])
n = len(tags)
ind = 0
temp_ind_32 = ind
#KEYWORDS_2 = ["(प्रकरण नं", "(प्रकारण नं.", "९प्रकरण नं।", "(प्रकरण", "(प्र.नं."]
KEYWORDS_2 = ["प्रकरण नं.", "(प्रकरण नं", "(प्रकारण नं.", "९प्रकरण नं।", "(प्रकरण", "(प्र नं.","( प्र. नं","(प्र.नं", "(प्र. नं", "( प्रकरण नं.", "( प्रकरणन", "( प्र.नं.", "( प्र . नं .", "( प ्र . नं .", "(प्ररकण नं.", "(प्रकराण नं."]
KEYWORDS_3 = ["निवेदक", "वादी", "पुनरावेदक", "निबेदक", "पुनरावदेक", "निवेदिका", "निवेदीका", "निवदेक", "न ि वेदक ः", "नि वेदक ः", "पुनरावेदन", "पुनरवेदिका", "पुनरावेदिका", "पुनरावेदीका", "बादि", "पुनराबेदक", "प्रतिबादी", "पुनरावेक", "अपीलाट", "निवेदनक", "उजुरवाला", "अपिलबाट", "अपिलाट"]
KEYWORDS_4 = ["विपक्षी", "प्रतिवादी", "प्रत्यर्थी", "बिपक्षी", "विपक्षी ः", "पिपक्षी", "प्रत्यार्थी", "विपक्ष", "रेस्पोण्डेण्ट", "रेस्पोन्डेन्ट", "प्रत्यथी"]
KEYWORDS_5 = ["विषय", "मुद्दा", "बिषय", "मूद्दा", "मुद्द", "मद्दा", "विपक्ष", "मुद्धा", "मुद् दा"]
KEYWORDS_6 = ["अदालत", "इजलास", "इजालास", "इजलाश", "बेञ्च"]
KEYWORDS_7 = ["आदेश", "फैसला", "फैसलमा", "निर्णय", "फै सला", "मुद्दा"]
KEYWORDS_8 = ["न्यायाधीश", "माननीय", "न्यायधीश", "न्यायाधीस", "न्ययाधीश", "न्यायाधिश", "न्यायाधी", "न्यानायधीश", "नयायाधीश", "न्यायाधधिश", "नयाधश"]
KEYWORDS_9 = [ "विरूद्ध", "बिरूद्ध", "विरुद्ध", "बिरुद्ध"]
KEYWORDS_10 = ["AP", "FN", "RE", "RI", "LE", "RV", "NF", "CI", "CR", "RC", "SA", "MS", "ND", "RB", "CF", "DF", "RF", "WO", "WH", "WS", "WF", "WC", "CC", "EC"]
# Extract court information
while ind < n:
text = tags[ind].get_text(strip=True)
if text and any(kw in text for kw in KEYWORDS_6):
details["अदालत / इजलास"] = text
ind += 1
break
ind += 1
if ind >= n:
ind = temp_ind_32
else:
temp_ind_32 = ind
# Extract judges
judges = []
while ind < n:
text = tags[ind].get_text(strip=True)
if text:
if any(kw in text for kw in KEYWORDS_8):
judges.append(text)
if any(text.startswith(kw) for kw in KEYWORDS_7) and ("मिति" in text or "मिती" in text):
details["न्यायाधीश"] = judges
details["आदेश मिति"] = text
ind += 1
break
ind += 1
if ind >= n:
ind = temp_ind_32
else:
temp_ind_32 = ind
# Extract case details
bisaya_before_kas_no = False
while ind < n:
text = tags[ind].get_text(strip=True)
if text:
if any(kw in text for kw in KEYWORDS_5):
bisaya_before_kas_no = True
details["विषय"] = text
ind += 1
break
details["केस_नम्बर"] = text
break
ind += 1
if ind > n:
ind = temp_ind_32
else:
temp_ind_32 = ind
# Handle different case structures
if bisaya_before_kas_no:
case_no = []
appellant = []
opposition = []
temp_flag = True
while temp_flag and ind < n:
# Extract case number
while ind < n:
text = tags[ind].get_text(strip=True)
if text:
case_no.append(text)
ind += 1
break
ind += 1
# Extract appellant
while ind < n:
text = tags[ind].get_text(strip=True)
if any(kw in text for kw in KEYWORDS_3):
if any(kw2 == text for kw2 in KEYWORDS_3):
ind+=1
text = tags[ind].get_text(strip=True)
appellant.append(text)
ind += 1
break
ind += 1
# Extract opposition
while ind < n:
text = tags[ind].get_text(strip=True)
if any(kw in text for kw in KEYWORDS_4):
if any(kw2 == text for kw2 in KEYWORDS_4):
ind += 1
text = tags[ind].get_text(strip=True)
opposition.append(text)
ind += 1
break
ind += 1
# Check for end condition
temp_ind = ind
for tag in tags[temp_ind:]:
text = tag.get_text(strip=True)
if any(kw in text for kw in KEYWORDS_2):
temp_flag = False
details["केस_नम्बर"] = case_no
details["निवेदक"] = appellant
details["विपक्षी"] = opposition
break
elif any(kw == text for kw in KEYWORDS_9):
break
if ind >= n:
ind = temp_ind_32
else:
temp_ind_32 = ind
else:
# Standard case structure
while ind < n:
text = tags[ind].get_text(strip=True)
if any(kw in text for kw in KEYWORDS_5):
details["विषय"] = text
ind += 1
break
ind += 1
if ind >= n:
ind = temp_ind_32
else:
temp_ind_32 = ind
while ind < n:
text = tags[ind].get_text(strip=True)
if any(kw in text for kw in KEYWORDS_3):
if any(kw2 == text for kw2 in KEYWORDS_3):
ind+=1
text = tags[ind].get_text(strip=True)
details["निवेदक"] = text
ind += 1
break
ind += 1
if ind >= n:
ind = temp_ind_32
else:
temp_ind_32 = ind
while ind < n:
text = tags[ind].get_text(strip=True)
if any(kw in text for kw in KEYWORDS_4):
if any(kw2 == text for kw2 in KEYWORDS_4):
ind+=1
text = tags[ind].get_text(strip=True)
details["विपक्षी"] = text
ind += 1
break
ind += 1
if ind >= n:
ind = temp_ind_32
else:
temp_ind_32 = ind
# Clean up extracted text
# self.clean_extracted_details(details, bisaya_before_kas_no)
# Extract prakarans and tahar
prakarans = []
prev = ""
tahar = []
temp_flag_tahar = False
for tag in tags[ind:]:
text = tag.get_text(separator=' ', strip=True)
if text:
#if "§" in text or any(kw in text for kw in KEYWORDS_2):
#prakarans.append(text)
if "§" in text or any(text.startswith(kw) for kw in KEYWORDS_2) or "फैसला"==text or "आदेश"==text or "फैसलाः"==text:
if any(text.startswith(kw) for kw in KEYWORDS_2):
if prev: # IMPROVEMENT 23: Only append if prev has content
prakarans.append(prev)
prakarans.append(text)
prev = ""
if "§" in text:
prakarans.append(text)
if "फैसला"==text or "आदेश"==text or "फैसलाः"==text:
if not prakarans:
prakarans.append(prev)
else:
prev = prev + " " + text if prev else text
if text in ["फैसला", "आदेश", "फैसलाः"] or temp_flag_tahar:
temp_flag_tahar = True
tahar.append(text)
# Process list items
next_sib = tag.find_next_sibling()
while next_sib and next_sib.name in ['ul', 'ol']:
for li in next_sib.find_all('li'):
li_text = li.get_text(separator=' ', strip=True)
if li_text:
if any(li_text.startswith(kw) for kw in KEYWORDS_2):
if prev: # IMPROVEMENT 24: Only append if prev has content
prakarans.append(prev)
prakarans.append(li_text)
prev = ""
else:
prev = prev + " " + li_text if prev else li_text
if li_text in ["फैसला", "आदेश", "फैसलाः"] or temp_flag_tahar:
temp_flag_tahar = True
tahar.append(li_text)
next_sib = next_sib.find_next_sibling()
details["प्रकरण"] = prakarans
details["ठहर"] = tahar
# Get HTML file path
html_file_path = ""
if mudda_type and sal:
filename = self.generate_html_filename(url, mudda_type, sal)
html_file_path = os.path.join(self.html_folder, filename)
# Combine all data, handling lists and strings appropriately
data = {
"लिङ्क": url,
"निर्णय नं.": decision_title,
"भाग": bhaag,
"मुद्दाको किसिम": mudda_type,
"साल": saal,
"महिना": mahina,
"अंक": anka,
"फैसला मिति": f"'{decision_date}'",
"अदालत / इजलास": details.get("अदालत / इजलास", "N/A"),
"न्यायाधीश": json.dumps(details.get("न्यायाधीश", []), ensure_ascii=False),
"आदेश मिति": details.get("आदेश मिति", "N/A"),
"केस_नम्बर": json.dumps(details.get("केस_नम्बर", []), ensure_ascii=False) if isinstance(details.get("केस_नम्बर"), list) else details.get("केस_नम्बर", "N/A"),
"विषय": details.get("विषय", "N/A"),
"निवेदक": json.dumps(details.get("निवेदक", []), ensure_ascii=False) if isinstance(details.get("निवेदक"), list) else details.get("निवेदक", "N/A"),
"विपक्षी": json.dumps(details.get("विपक्षी", []), ensure_ascii=False) if isinstance(details.get("विपक्षी"), list) else details.get("विपक्षी", "N/A"),
"प्रकरण": json.dumps(details.get("प्रकरण", []), ensure_ascii=False),
"ठहर": json.dumps(details.get("ठहर", []), ensure_ascii=False),
"html_file_path": html_file_path
}
# Save to SQLite
self.save_to_sqlite(data)
print(f"{url} - Successfully Scraped and Entered")
return True
except Exception as e:
print(f"Error scraping {url}: {e}")
return False
def save_to_sqlite(self, data):
"""Save data to SQLite database"""
cursor = self.conn.cursor()
try:
cursor.execute('''
INSERT OR REPLACE INTO cases (
लिङ्क, निर्णय_नं, भाग, मुद्दाको_किसिम, साल, महिना, अंक, फैसला_मिति,
अदालत_वा_इजलास, न्यायाधीश, आदेश_मिति, केस_नम्बर, विषय, निवेदक, विपक्षी,
प्रकरण, ठहर, html_file_path
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
''', (
data["लिङ्क"], data["निर्णय नं."], data["भाग"], data["मुद्दाको किसिम"],
data["साल"], data["महिना"], data["अंक"], data["फैसला मिति"],
data["अदालत / इजलास"], data["न्यायाधीश"], data["आदेश मिति"], data["केस_नम्बर"],
data["विषय"], data["निवेदक"], data["विपक्षी"], data["प्रकरण"], data["ठहर"],
data["html_file_path"]
))
self.conn.commit()
except sqlite3.Error as e:
print(f"Database error: {e}")
raise
def save_failed_links(self, failed_links, mudda_type, sal, error_msg="Unknown error"):
"""Save failed links to SQLite database"""
if failed_links:
cursor = self.conn.cursor()
for link in failed_links:
try:
cursor.execute('''
INSERT INTO failed_links (मुद्दाको_किसिम, साल, लिङ्क, error_message, retry_count)
VALUES (?, ?, ?, ?, ?)
''', (mudda_type, sal, link, error_msg, 1))
except sqlite3.Error as e:
print(f"Error saving failed link {link}: {e}")
try:
self.conn.commit()
except sqlite3.Error as e:
print(f"Error committing failed links: {e}")
def test_single_link(self, url, mudda_type=None, sal=None, use_saved=True):
"""Test scraping a single link"""
print(f"Testing single link: {url}")
# If mudda_type and sal not provided, try to extract from existing data or filename
if not mudda_type or not sal:
cursor = self.conn.cursor()
cursor.execute('SELECT मुद्दाको_किसिम, साल FROM cases WHERE लिङ्क = ?', (url,))
result = cursor.fetchone()
if result:
mudda_type, sal = result
print(f"Found existing data: mudda_type={mudda_type}, sal={sal}")
if not mudda_type or not sal:
print("Warning: mudda_type and sal not provided and couldn't be determined from existing data")
print("Using generic scraping without HTML file management")
success = self.scrape_case_details_generic(url, mudda_type, sal, use_saved)
if success:
print("✓ Successfully scraped and saved to database")
else:
print("✗ Failed to scrape")
return success
def test_saved_html_files(self, mudda_type=None, sal=None, limit=None):
"""Test scraping from saved HTML files"""
html_files = self.get_saved_html_files_by_criteria(mudda_type, sal)
if not html_files:
print("No saved HTML files found matching criteria")
return
print(f"Found {len(html_files)} saved HTML files")
if limit:
html_files = html_files[:limit]
print(f"Testing first {limit} files")
successful_count = 0
failed_count = 0
for html_file in html_files:
file_mudda_type, file_sal, link_number = self.extract_info_from_filename(html_file)
if not file_mudda_type or not file_sal:
print(f"Could not extract info from filename: {html_file}")
failed_count += 1
continue
# Reconstruct URL (this is a simplified approach)
url = f"https://nkp.gov.np/full_detail/{link_number}"
print(f"Testing {html_file} -> {file_mudda_type}, {file_sal}")
success = self.scrape_case_details_generic(url, file_mudda_type, file_sal, use_saved=True)
if success:
successful_count += 1
else:
failed_count += 1
print(f"\nTest Results:")
print(f"✓ Successful: {successful_count}")
print(f"✗ Failed: {failed_count}")
print(f"Total: {len(html_files)}")
def run_scraper(self, mudda_type, sal, use_saved=True):
"""Main method to run the scraper"""
print(f"Starting scraper for mudda_type: {mudda_type}, sal: {sal}")
print(f"Using database: {self.output_db}")
print(f"HTML folder: {self.html_folder}")
print(f"Use saved HTML files: {use_saved}")
# Validate inputs
if mudda_type not in self.mudda_type_arr:
raise ValueError(f"Invalid mudda_type. Must be one of: {self.mudda_type_arr}")
# Generate search URL
try:
search_url = self.search_url(mudda_type, sal)
print(f"Search URL: {search_url}")
except Exception as e:
print(f"Error generating search URL: {e}")
return
# Get all case URLs
print("Fetching all case URLs...")
case_urls = self.get_all_pages(search_url, mudda_type, sal, use_saved)
if not case_urls:
print("No case URLs found!")
return
print(f"Found {len(case_urls)} case URLs to scrape")
# Scrape each case
successful_count = 0
failed_links = []
for i, url in enumerate(case_urls, 1):
print(f"Processing {i}/{len(case_urls)}: {url}")
success = self.scrape_case_details_generic(url, mudda_type, sal, use_saved)
if success:
successful_count += 1
else:
failed_links.append(url)
# Add delay between requests only if downloading from web
if not use_saved:
time.sleep(2)
# Retry failed links once
if failed_links:
print(f"\nRetrying {len(failed_links)} failed links...")
still_failed = []
for i, url in enumerate(failed_links, 1):
print(f"Retrying {i}/{len(failed_links)}: {url}")
success = self.scrape_case_details_generic(url, mudda_type, sal, use_saved=False) # Force web download on retry
if success:
successful_count += 1
else:
still_failed.append(url)
time.sleep(2)
# Save permanently failed links
if still_failed:
self.save_failed_links(still_failed, mudda_type, sal, "Failed after retry")
print(f"\nFinal Results:")
print(f"Total links found: {len(case_urls)}")
print(f"Successfully scraped: {successful_count}")
print(f"Failed to scrape: {len(still_failed)}")
if still_failed:
print(f"Failed links saved to database: failed_links table")
else:
print(f"\nResults:")
print(f"Total links found: {len(case_urls)}")
print(f"Successfully scraped: {successful_count}")
print(f"Scraped data saved to SQLite database: {self.output_db}")
def close(self):
"""Explicitly close the database connection"""
if hasattr(self, 'conn'):
self.conn.close()
def __del__(self):
"""Close SQLite connection when the object is destroyed"""
self.close()
def create_parser():
    """Build the command-line parser for the scraper.

    Returns:
        An ``argparse.ArgumentParser`` with the scraping, testing and listing
        options, and usage examples in the epilog.
    """
    usage_examples = """
Examples:
# Scrape specific mudda_type and year
python app.py --mudda_type "दुनियाबादी देवानी" --nepali_year "२०७३" --database_name "app_test_db.db"
# Test a specific link
python app.py --test_link "https://nkp.gov.np/8035" --mudda_type "दुनियाबादी देवानी" --nepali_year "२०७३"
# Test saved HTML files
python app.py --test_saved --nepali_year "२०७३" --limit 5
# Use saved HTML files for scraping (faster)
python app.py --mudda_type "दुनियाबादी देवानी" --nepali_year "२०७३" --use_saved
# List available mudda types
python app.py --list_mudda_types
"""
    cli = argparse.ArgumentParser(
        description="Legal Case Scraper for Nepal Kanoon Patrika",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )
    # (flag, add_argument keyword arguments), registered in this order.
    option_specs = (
        ('--mudda_type', {'type': str, 'help': 'Mudda type (e.g., "दुनियाबादी देवानी")'}),
        ('--nepali_year', {'type': str, 'help': 'Nepali year (e.g., "२०७३")'}),
        ('--database_name', {'type': str, 'default': 'legal_cases_2.db',
                             'help': 'SQLite database filename (default: legal_cases_2.db)'}),
        ('--html_folder', {'type': str, 'default': 'scraped_html',
                           'help': 'Folder to store HTML files (default: scraped_html)'}),
        ('--use_saved', {'action': 'store_true',
                         'help': 'Use saved HTML files when available (faster)'}),
        ('--test_link', {'type': str, 'help': 'Test scraping a specific link'}),
        ('--test_saved', {'action': 'store_true',
                          'help': 'Test scraping from saved HTML files'}),
        ('--limit', {'type': int,
                     'help': 'Limit number of files to test (use with --test_saved)'}),
        ('--list_mudda_types', {'action': 'store_true',
                                'help': 'List all available mudda types'}),
    )
    for flag, settings in option_specs:
        cli.add_argument(flag, **settings)
    return cli
def main():
    """Parse the command line and run the requested action.

    Modes, checked in this order: list the case categories, test one link,
    test saved HTML files, or scrape a category/year (which requires both
    ``--mudda_type`` and ``--nepali_year``). Any exception raised by an
    action is printed, and the scraper is always closed afterwards.
    """
    args = create_parser().parse_args()

    # List mudda types if requested
    if args.list_mudda_types:
        lister = LegalCaseScraper()
        print("Available mudda_type options:")
        for position, label in enumerate(lister.mudda_type_arr, start=1):
            print(f"{position}. {label}")
        lister.close()
        return

    # Create the scraper
    scraper = LegalCaseScraper(
        output_db=args.database_name,
        html_folder=args.html_folder
    )
    try:
        if args.test_link:
            # Test single link
            scraper.test_single_link(
                args.test_link,
                args.mudda_type,
                args.nepali_year,
                use_saved=args.use_saved
            )
        elif args.test_saved:
            # Test saved HTML files
            scraper.test_saved_html_files(
                mudda_type=args.mudda_type,
                sal=args.nepali_year,
                limit=args.limit
            )
        elif not args.mudda_type or not args.nepali_year:
            print("Error: --mudda_type and --nepali_year are required for scraping")
            print("Use --help for usage examples")
        else:
            # Regular scraping
            scraper.run_scraper(
                mudda_type=args.mudda_type,
                sal=args.nepali_year,
                use_saved=args.use_saved
            )
    except Exception as e:
        print(f"Error: {e}")
    finally:
        scraper.close()
# Run the command-line interface only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
main()