import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import os
import sys
import argparse
from urllib.parse import urlencode
import sqlite3
import json
import re
from pathlib import Path
import glob
import nepali_datetime
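# Note: nepali_datetime is a third-party package (PyPI: nepali-datetime) used
# here to look up the current Bikram Sambat year.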

class LegalCaseScraper:
    def __init__(self, output_db="legal_cases_2.db", html_folder="scraped_html"):
        self.mudda_type_arr = [
            "दुनियाबादी देवानी",
            "सरकारबादी देवानी",
            "दुनियावादी फौजदारी",
            "सरकारवादी फौजदारी",
            "रिट",
            "निवेदन",
            "विविध"
        ]
        self.successful_entries = 0
        self.not_entered_links = []
        self.still_not_entered_links = []
        self.output_db = output_db
        self.html_folder = html_folder
        # Create HTML folder if it doesn't exist
        os.makedirs(self.html_folder, exist_ok=True)
        # Initialize SQLite database
        self.conn = sqlite3.connect(self.output_db)
        self.create_tables()
    def create_tables(self):
        """Create SQLite tables for scraped data and failed links"""
        cursor = self.conn.cursor()
        # Table for scraped case data
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS cases (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                लिङ्क TEXT UNIQUE,
                निर्णय_नं TEXT,
                भाग TEXT,
                मुद्दाको_किसिम TEXT,
                साल TEXT,
                महिना TEXT,
                अंक TEXT,
                फैसला_मिति TEXT,
                अदालत_वा_इजलास TEXT,
                न्यायाधीश TEXT,
                आदेश_मिति TEXT,
                केस_नम्बर TEXT,
                विषय TEXT,
                निवेदक TEXT,
                विपक्षी TEXT,
                प्रकरण TEXT,
                ठहर TEXT,
                html_file_path TEXT,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        ''')
        # Table for failed links
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS failed_links (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                मुद्दाको_किसिम TEXT,
                साल TEXT,
                लिङ्क TEXT,
                error_message TEXT,
                retry_count INTEGER DEFAULT 0,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        ''')
        self.conn.commit()
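    # Note: SQLite accepts these Devanagari column names as plain identifiers,
    # so they can be queried directly, e.g. (illustrative; साल is stored as
    # Devanagari text exactly as scraped):
    #   SELECT निर्णय_नं, विषय FROM cases WHERE साल = '२०४५'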
    def get_mudda_type_number(self, mudda_type):
        """Get mudda type number (1-7) from mudda type name"""
        try:
            return str(self.mudda_type_arr.index(mudda_type) + 1)
        except ValueError:
            raise ValueError(f"Invalid mudda_type: {mudda_type}. Must be one of {self.mudda_type_arr}")

    def extract_link_number(self, url):
        """Extract the number at the end of the URL"""
        match = re.search(r'/(\d+)/?$', url)
        return match.group(1) if match else "unknown"
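    # Illustrative example (hypothetical URL): a case page at
    # https://nkp.gov.np/full_detail/9422 yields "9422"; a trailing slash is
    # tolerated by the regex, and non-matching URLs return "unknown".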
    def generate_html_filename(self, url, mudda_type, sal):
        """Generate standardized HTML filename: mudda_number_year_link_number.html"""
        mudda_number = self.get_mudda_type_number(mudda_type)
        english_sal = self.nepali_sal_to_english_sal(sal)
        link_number = self.extract_link_number(url)
        return f"{mudda_number}_{english_sal}_{link_number}.html"
    def nepali_sal_to_english_sal(self, sal):
        """Convert Nepali numerals to English numerals"""
        if not sal:
            return ""
        nepali_to_english = {
            '०': '0', '१': '1', '२': '2', '३': '3', '४': '4',
            '५': '5', '६': '6', '७': '7', '८': '8', '९': '9'
        }
        try:
            return ''.join(nepali_to_english.get(char, char) for char in str(sal))
        except (TypeError, AttributeError):
            raise ValueError(f"Input must be a string containing Nepali numerals, got: {type(sal)}")
    def search_url(self, mudda_type, sal):
        """Generate search URL based on mudda_type and sal"""
        mudda_types = {name: str(idx + 1) for idx, name in enumerate(self.mudda_type_arr)}
        if mudda_type not in mudda_types:
            raise ValueError(f"Invalid mudda_type: {mudda_type}. Must be one of {self.mudda_type_arr}")
        english_sal = self.nepali_sal_to_english_sal(sal)
        base_url = "https://nkp.gov.np/"
        params = {
            "mudda_number": "",
            "faisala_date_from": "",
            "faisala_date_to": "",
            "mudda_type": mudda_types[mudda_type],
            "mudda_name": "",
            "badi": "",
            "pratibadi": "",
            "judge": "",
            "ijlas_type": "",
            "nirnaya_number": "",
            "faisala_type": "",
            "keywords": "",
            "edition": "",
            "year": english_sal,
            "month": "",
            "volume": "",
            "Submit": "खोज्नुहोस्"
        }
        return f"{base_url}?{urlencode(params)}#"
    def save_html_file(self, url, html_content, mudda_type, sal):
        """Save HTML content to file with standardized naming"""
        filename = self.generate_html_filename(url, mudda_type, sal)
        filepath = os.path.join(self.html_folder, filename)
        with open(filepath, "w", encoding="utf-8") as f:
            f.write(html_content)
        return filepath

    def load_html_file(self, url, mudda_type, sal):
        """Load HTML content from existing file"""
        filename = self.generate_html_filename(url, mudda_type, sal)
        filepath = os.path.join(self.html_folder, filename)
        if os.path.exists(filepath):
            with open(filepath, "r", encoding="utf-8") as f:
                return f.read()
        return None
    def return_soup(self, url, mudda_type=None, sal=None, use_saved=True, max_retries=3):
        """Get soup object from URL or saved HTML file"""
        # Try to load from saved file first if requested
        if use_saved and mudda_type and sal:
            html_content = self.load_html_file(url, mudda_type, sal)
            if html_content:
                print(f"Using saved HTML file for {url}")
                return BeautifulSoup(html_content, 'html.parser')
        # Download from web if not found in saved files or use_saved is False
        for attempt in range(max_retries):
            try:
                r = requests.get(url, timeout=30, headers={
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                })
                if r.status_code == 200:
                    r.encoding = 'utf-8'
                    # Save HTML file if mudda_type and sal are provided
                    if mudda_type and sal:
                        filepath = self.save_html_file(url, r.text, mudda_type, sal)
                        print(f"Saved HTML to: {filepath}")
                    return BeautifulSoup(r.text, 'html.parser')
                else:
                    print(f"Attempt {attempt + 1}: Failed to retrieve {url}. Status code: {r.status_code}")
                    if attempt < max_retries - 1:
                        time.sleep(2 ** attempt)
            except requests.exceptions.RequestException as e:
                print(f"Attempt {attempt + 1}: Error scraping {url}: {e}")
                if attempt < max_retries - 1:
                    time.sleep(2 ** attempt)
        return None
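    # Minimal usage sketch: return_soup(url, "रिट", "२०४५") serves the cached
    # copy when present; failed downloads back off 2**attempt seconds between
    # retries and fall through to None after max_retries attempts.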
    def get_saved_html_files_by_criteria(self, mudda_type=None, sal=None):
        """Get list of saved HTML files matching criteria"""
        pattern = "*"
        if mudda_type and sal:
            mudda_number = self.get_mudda_type_number(mudda_type)
            english_sal = self.nepali_sal_to_english_sal(sal)
            pattern = f"{mudda_number}_{english_sal}_*.html"
        elif sal:
            english_sal = self.nepali_sal_to_english_sal(sal)
            pattern = f"*_{english_sal}_*.html"
        elif mudda_type:
            mudda_number = self.get_mudda_type_number(mudda_type)
            pattern = f"{mudda_number}_*_*.html"
        search_path = os.path.join(self.html_folder, pattern)
        return glob.glob(search_path)

    def extract_info_from_filename(self, filename):
        """Extract mudda_type, sal, and link_number from filename"""
        basename = os.path.basename(filename)
        match = re.match(r'(\d+)_(\d+)_(\d+)\.html', basename)
        if match:
            mudda_number, sal, link_number = match.groups()
            mudda_type = self.mudda_type_arr[int(mudda_number) - 1]
            return mudda_type, sal, link_number
        return None, None, None
    def from_each_page(self, links):
        """Extract unique case links from page links"""
        li = []
        i = 0
        while i < len(links):
            href = links[i].get('href')
            if href and "#" in href:
                i += 1
                if i < len(links):
                    temp_href = links[i].get('href')
                    if temp_href:
                        li.append(temp_href)
            else:
                i += 1
        # Deduplicate while preserving order
        unique_list = []
        if li:
            unique_list = list(dict.fromkeys(li))
        return unique_list
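    # Heuristic used above: each results row appears to render an in-page
    # anchor (href containing "#") immediately followed by the anchor that
    # carries the real case URL, so the method takes the link right after
    # each "#" link; rows without that pairing are skipped.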
    def get_all_pages(self, initial_url, mudda_type=None, sal=None, use_saved=True):
        """Get all page URLs for pagination"""
        soup = self.return_soup(initial_url, mudda_type, sal, use_saved)
        if not soup:
            return []
        links = soup.find_all('a')
        all_links = []
        other_pages = []
        for link in links:
            href = link.get('href')
            if href:
                all_links.append(href)
                if "https://nkp.gov.np/advance_search/" in href:
                    other_pages.append(href)
        unique_list = self.from_each_page(links)
        # Handle pagination
        if "javascript:void(0)" in all_links and other_pages:
            # Find the largest offset among the "...=<offset>" pager links
            mx = 0
            for j in other_pages:
                temp = j.rsplit("=", 1)[-1]
                try:
                    temp2 = int(temp)
                    if mx < temp2:
                        mx = temp2
                except ValueError:
                    continue
            if mx > 0:
                # Rebuild every page URL up to the maximum offset (steps of 20)
                st = other_pages[0].rsplit("=", 1)[0] + "="
                real_other_pages = []
                for i in range(20, mx + 1, 20):
                    real_other_pages.append(st + str(i))
                unique_list2 = []
                for page_url in real_other_pages:
                    print(f"Processing page: {page_url}")
                    try:
                        page_soup = self.return_soup(page_url, mudda_type, sal, use_saved)
                        if page_soup:
                            page_links = page_soup.find_all('a')
                            unique_list2 += self.from_each_page(page_links)
                    except Exception as e:
                        print(f"Error scraping page {page_url}: {e}")
                unique_list += unique_list2
        # Remove duplicates
        unique_unique_list = list(dict.fromkeys(unique_list))
        return unique_unique_list
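    # Pagination sketch: the site's pager links end in "=<offset>" with offsets
    # stepping by 20, so the largest offset found is enough to enumerate every
    # results page up front (the parameter name itself is not relied on here,
    # only the trailing "=<n>").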
    def get_edition_field(self, soup, label):
        """Extract edition field from soup"""
        edition_info = soup.find("div", id="edition-info")
        if edition_info:
            for span in edition_info.find_all("span"):
                if label in span.text:
                    strong = span.find("strong")
                    return strong.text.strip() if strong else None
        return None
    def determine_scraper_method(self, sal):
        """Determine which scraper method to use based on year"""
        eng_sal = int(self.nepali_sal_to_english_sal(sal))
        today = nepali_datetime.date.today()
        latest_nepali_year = int(today.year)
        if 2015 <= eng_sal <= 2044:
            return self.scrape_case_details_2015_to_2044
        elif 2045 <= eng_sal <= 2050:
            return self.scrape_case_details_2045_to_2050
        elif 2051 <= eng_sal <= 2061:
            return self.scrape_case_details_2051_to_2061
        elif 2062 <= eng_sal <= 2072:
            return self.scrape_case_details_2062_to_2072
        elif 2073 <= eng_sal < latest_nepali_year:
            return self.scrape_case_details_2073_to_2080_and_beyond
        else:
            raise ValueError(f"No scraper method available for year {eng_sal}, or those records are not yet available on the Nepal Kanun Patrika website")
    def scrape_case_details_generic(self, url, mudda_type, sal, use_saved=True):
        """Generic method that routes to the appropriate scraper based on year"""
        try:
            scraper_method = self.determine_scraper_method(sal)
            return scraper_method(url, mudda_type, sal, use_saved)
        except ValueError as e:
            print(f"Error: {e}")
            return False
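    # Minimal usage sketch (case_url is a placeholder for a link collected
    # via get_all_pages):
    #   scraper = LegalCaseScraper()
    #   scraper.scrape_case_details_generic(case_url, "रिट", "२०४५")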
    def scrape_case_details_2015_to_2044(self, url, mudda_type, sal=None, use_saved=True):
        """Scrape details from a single case URL (2015-2044)"""
        try:
            # Skip URLs already present in the database
            cursor = self.conn.cursor()
            cursor.execute('SELECT लिङ्क FROM cases WHERE लिङ्क = ?', (url,))
            if cursor.fetchone():
                print(f"URL {url} already exists in database, skipping...")
                return True
            # Get soup using saved HTML or web
            soup = self.return_soup(url, mudda_type, sal, use_saved)
            if not soup:
                print(f"Failed to get content for {url}")
                return False
            # Extract basic information (with bounds checking on the title split)
            title_tag = soup.find("h1", class_="post-title")
            decision_title = title_tag.get_text(strip=True).split()[2] if title_tag and len(title_tag.get_text(strip=True).split()) > 2 else "N/A"
            bhaag = self.get_edition_field(soup, "भाग")
            saal = self.get_edition_field(soup, "साल")
            mahina = self.get_edition_field(soup, "महिना")
            anka = self.get_edition_field(soup, "अंक")
            # Extract decision date
            post_meta = soup.find("div", class_="post-meta")
            decision_date = "N/A"
            if post_meta and "फैसला मिति" in post_meta.text:
                try:
                    decision_date = post_meta.text.strip().split("फैसला मिति :")[-1].split("\n")[0].strip().split()[0]
                except IndexError:
                    decision_date = "N/A"
            # Extract detailed information (the trailing space in the id appears
            # to match the site's markup verbatim)
            div_tag = soup.find("div", id="faisala_detail ")
            details = {}
            if div_tag:
                tags = div_tag.find_all(['h1', 'p'])
                n = len(tags)
                ind = 0
                temp_ind_32 = ind
                KEYWORDS_2 = ["प्रकरण नं.", "(प्रकरण नं", "(प्रकारण नं.", "९प्रकरण नं।", "(प्रकरण", "(प्र नं.", "( प्र. नं", "(प्र.नं", "(प्र. नं", "( प्रकरण नं.", "( प्रकरणन", "( प्र.नं.", "( प्र . नं .", "( प ्र . नं .", "(प्ररकण नं.", "(प्रकराण नं."]
                KEYWORDS_3 = ["निवेदक", "वादी", "पुनरावेदक", "निबेदक", "पुनरावदेक", "निवेदिका", "निवेदीका", "निवदेक", "न ि वेदक ः", "नि वेदक ः", "पुनरावेदन", "पुनरवेदिका", "पुनरावेदिका", "पुनरावेदीका", "बादि", "पुनराबेदक", "प्रतिबादी", "पुनरावेक", "अपीलाट", "निवेदनक", "उजुरवाला", "अपिलबाट", "अपिलाट"]
                KEYWORDS_4 = ["विपक्षी", "प्रतिवादी", "प्रत्यर्थी", "बिपक्षी", "विपक्षी ः", "पिपक्षी", "प्रत्यार्थी", "विपक्ष", "रेस्पोण्डेण्ट", "रेस्पोन्डेन्ट", "प्रत्यथी"]
                KEYWORDS_5 = ["विषय", "मुद्दा", "बिषय", "मूद्दा", "मुद्द", "मद्दा", "विपक्ष", "मुद्धा", "मुद् दा"]
                KEYWORDS_6 = ["अदालत", "इजलास", "इजालास", "इजलाश", "बेञ्च"]
                KEYWORDS_7 = ["आदेश", "फैसला", "फैसलमा", "निर्णय", "फै सला", "मुद्दा"]
                KEYWORDS_8 = ["न्यायाधीश", "माननीय", "न्यायधीश", "न्यायाधीस", "न्ययाधीश", "न्यायाधिश", "न्यायाधी", "न्यानायधीश", "नयायाधीश", "न्यायाधधिश", "नयाधश"]
                KEYWORDS_9 = ["विरूद्ध", "बिरूद्ध", "विरुद्ध", "बिरुद्ध"]
                KEYWORDS_10 = ["AP", "FN", "RE", "RI", "LE", "RV", "NF", "CI", "CR", "RC", "SA", "MS", "ND", "RB", "CF", "DF", "RF", "WO", "WH", "WS", "WF", "WC", "CC", "EC"]
                # Extract court information
                temp_ijlash = ""
                while ind < n:
                    text = tags[ind].get_text(separator=' ', strip=True)
                    if text:
                        if any(kw == text for kw in KEYWORDS_6):
                            if "निर्णय नं." not in temp_ijlash:
                                details["इजलास"] = temp_ijlash
                            ind += 1
                            break
                        elif any(kw in text for kw in KEYWORDS_6):
                            details["इजलास"] = text
                            ind += 1
                            if ind < n:  # bounds check before peeking at the next tag
                                text_2 = tags[ind].get_text(separator=' ', strip=True)
                                if not any(kw in text_2 for kw in KEYWORDS_8):
                                    details["इजलास"] = text + " " + text_2
                                    ind += 1
                            break
                        elif any(kw in text for kw in KEYWORDS_8):
                            if "निर्णय नं." not in temp_ijlash:
                                details["इजलास"] = temp_ijlash
                            break
                        temp_ijlash = text
                    ind += 1
                if ind >= n:
                    ind = temp_ind_32
                else:
                    temp_ind_32 = ind
                # Extract judges
                judges = []
                while ind < n:
                    text = tags[ind].get_text(separator=' ', strip=True)
                    if text:
                        if any(kw in text for kw in KEYWORDS_8):
                            judges.append(text)
                        else:
                            details["न्यायाधीश"] = judges
                            if any(kw in text for kw in KEYWORDS_10):
                                details["केस_नम्बर"] = text
                                ind += 1
                            elif not any(kw2 in text for kw2 in KEYWORDS_3) and not any(kw2 in text for kw2 in KEYWORDS_5):
                                details["केस_नम्बर"] = text
                                ind += 1
                            break
                    ind += 1
                if ind >= n:
                    ind = temp_ind_32
                else:
                    temp_ind_32 = ind
                # Standard case structure
                bisaya_before_niweduck = False
                temp_ind_64 = ind
                while temp_ind_64 < n:
                    text = tags[temp_ind_64].get_text(separator=' ', strip=True)
                    if any(kw in text for kw in KEYWORDS_3) or any(kw in text for kw in KEYWORDS_4):
                        break
                    if any(kw in text for kw in KEYWORDS_5):
                        bisaya_before_niweduck = True
                        break
                    temp_ind_64 += 1
                if bisaya_before_niweduck:
                    while ind < n:
                        text = tags[ind].get_text(separator=' ', strip=True)
                        if any(text.startswith(kw) for kw in KEYWORDS_5):
                            details["विषय"] = text
                            ind += 1
                            break
                        if any(kw in text for kw in KEYWORDS_3):
                            ind = temp_ind_32
                            break
                        ind += 1
                else:
                    while ind < n:
                        text = tags[ind].get_text(separator=' ', strip=True)
                        if any(text.startswith(kw) for kw in KEYWORDS_7) and ("मिति" in text or "मिती" in text):
                            details["आदेश मिति"] = text
                            ind += 1
                            break
                        if any(kw in text for kw in KEYWORDS_3):
                            ind = temp_ind_32
                            break
                        ind += 1
                if ind >= n:
                    ind = temp_ind_32
                else:
                    temp_ind_32 = ind
                while ind < n:
                    text = tags[ind].get_text(separator=' ', strip=True)
                    if any(kw in text for kw in KEYWORDS_3):
                        if any(kw2 == text for kw2 in KEYWORDS_3):
                            ind += 1
                            if ind < n:  # bounds check
                                text = tags[ind].get_text(separator=' ', strip=True)
                        details["निवेदक"] = text
                        ind += 1
                        break
                    ind += 1
                if ind >= n:
                    ind = temp_ind_32
                else:
                    temp_ind_32 = ind
                while ind < n:
                    text = tags[ind].get_text(separator=' ', strip=True)
                    if any(kw in text for kw in KEYWORDS_4):
                        if any(kw2 == text for kw2 in KEYWORDS_4):
                            ind += 1
                            if ind < n:  # bounds check
                                text = tags[ind].get_text(separator=' ', strip=True)
                        details["विपक्षी"] = text
                        ind += 1
                        break
                    ind += 1
                if ind >= n:
                    ind = temp_ind_32
                else:
                    temp_ind_32 = ind
                if not bisaya_before_niweduck:
                    while ind < n:
                        text = tags[ind].get_text(separator=' ', strip=True)
                        if any(text.startswith(kw) for kw in KEYWORDS_5):
                            details["विषय"] = text
                            ind += 1
                            break
                        if any(kw in text for kw in KEYWORDS_2):
                            ind = temp_ind_32
                            break
                        ind += 1
                else:
                    while ind < n:
                        text = tags[ind].get_text(separator=' ', strip=True)
                        if any(text.startswith(kw) for kw in KEYWORDS_7) and ("मिति" in text or "मिती" in text):
                            details["आदेश मिति"] = text
                            ind += 1
                            break
                        if any(kw in text for kw in KEYWORDS_2):
                            ind = temp_ind_32
                            break
                        ind += 1
                if ind >= n:
                    ind = temp_ind_32
                else:
                    temp_ind_32 = ind
                # Extract prakarans and tahar
                prakarans = []
                prev = ""
                tahar = []
                temp_flag_tahar = False
                for tag in tags[ind:]:
                    text = tag.get_text(separator=' ', strip=True)
                    if text:
                        if "§" in text or any(text.startswith(kw) for kw in KEYWORDS_2) or text in ("फैसला", "आदेश", "फैसलाः"):
                            if any(text.startswith(kw) for kw in KEYWORDS_2):
                                if prev:  # only flush prev when it has content
                                    prakarans.append(prev)
                                prakarans.append(text)
                                prev = ""
                            if "§" in text:
                                prakarans.append(text)
                            if text in ("फैसला", "आदेश", "फैसलाः"):
                                if not prakarans:
                                    prakarans.append(prev)
                        else:
                            prev = prev + " " + text if prev else text
                        if text in ["फैसला", "आदेश", "फैसलाः"] or temp_flag_tahar:
                            temp_flag_tahar = True
                            tahar.append(text)
                    # Process list items
                    next_sib = tag.find_next_sibling()
                    while next_sib and next_sib.name in ['ul', 'ol']:
                        for li in next_sib.find_all('li'):
                            li_text = li.get_text(separator=' ', strip=True)
                            if li_text:
                                if any(li_text.startswith(kw) for kw in KEYWORDS_2):
                                    if prev:  # only flush prev when it has content
                                        prakarans.append(prev)
                                    prakarans.append(li_text)
                                    prev = ""
                                else:
                                    prev = prev + " " + li_text if prev else li_text
                                if li_text in ["फैसला", "आदेश", "फैसलाः"] or temp_flag_tahar:
                                    temp_flag_tahar = True
                                    tahar.append(li_text)
                        next_sib = next_sib.find_next_sibling()
                details["प्रकरण"] = prakarans
                details["ठहर"] = tahar
            # Get HTML file path
            html_file_path = ""
            if mudda_type and sal:
                filename = self.generate_html_filename(url, mudda_type, sal)
                html_file_path = os.path.join(self.html_folder, filename)
            # Combine all data, handling lists and None values
            data = {
                "लिङ्क": url,
                "निर्णय नं.": decision_title,
                "भाग": bhaag or "N/A",
                "मुद्दाको किसिम": mudda_type,
                "साल": saal or "N/A",
                "महिना": mahina or "N/A",
                "अंक": anka or "N/A",
                "फैसला मिति": f"'{decision_date}'",
                "अदालत / इजलास": details.get("इजलास", "N/A"),
                "न्यायाधीश": json.dumps(details.get("न्यायाधीश", []), ensure_ascii=False),
                "आदेश मिति": details.get("आदेश मिति", "N/A"),
                "केस_नम्बर": json.dumps(details.get("केस_नम्बर", []), ensure_ascii=False) if isinstance(details.get("केस_नम्बर"), list) else details.get("केस_नम्बर", "N/A"),
                "विषय": details.get("विषय", "N/A"),
                "निवेदक": json.dumps(details.get("निवेदक", []), ensure_ascii=False) if isinstance(details.get("निवेदक"), list) else details.get("निवेदक", "N/A"),
                "विपक्षी": json.dumps(details.get("विपक्षी", []), ensure_ascii=False) if isinstance(details.get("विपक्षी"), list) else details.get("विपक्षी", "N/A"),
                "प्रकरण": json.dumps(details.get("प्रकरण", []), ensure_ascii=False),
                "ठहर": json.dumps(details.get("ठहर", []), ensure_ascii=False),
                "html_file_path": html_file_path
            }
            # Save to SQLite
            self.save_to_sqlite(data)
            print(f"{url} - Successfully Scraped and Entered")
            return True
        except Exception as e:
            print(f"Error scraping {url}: {e}")
            return False
    def scrape_case_details_2045_to_2050(self, url, mudda_type, sal=None, use_saved=True):
        """Scrape details from a single case URL (2045-2050)"""
        try:
            # Skip URLs already present in the database
            cursor = self.conn.cursor()
            cursor.execute('SELECT लिङ्क FROM cases WHERE लिङ्क = ?', (url,))
            if cursor.fetchone():
                print(f"URL {url} already exists in database, skipping...")
                return True
            # Get soup using saved HTML or web
            soup = self.return_soup(url, mudda_type, sal, use_saved)
            if not soup:
                print(f"Failed to get content for {url}")
                return False
            # Extract basic information (with bounds checking on the title split)
            title_tag = soup.find("h1", class_="post-title")
            decision_title = title_tag.get_text(strip=True).split()[2] if title_tag and len(title_tag.get_text(strip=True).split()) > 2 else "N/A"
            bhaag = self.get_edition_field(soup, "भाग")
            saal = self.get_edition_field(soup, "साल")
            mahina = self.get_edition_field(soup, "महिना")
            anka = self.get_edition_field(soup, "अंक")
            # Extract decision date
            post_meta = soup.find("div", class_="post-meta")
            decision_date = "N/A"
            if post_meta and "फैसला मिति" in post_meta.text:
                try:
                    decision_date = post_meta.text.strip().split("फैसला मिति :")[-1].split("\n")[0].strip().split()[0]
                except IndexError:
                    decision_date = "N/A"
            # Extract detailed information
            div_tag = soup.find("div", id="faisala_detail ")
            details = {}
            if div_tag:
                tags = div_tag.find_all(['h1', 'p'])
                n = len(tags)
                ind = 0
                temp_ind_32 = ind
                KEYWORDS_2 = ["प्रकरण नं.", "(प्रकरण नं", "(प्रकारण नं.", "९प्रकरण नं।", "(प्रकरण", "(प्र नं.", "( प्र. नं", "(प्र.नं", "(प्र. नं", "( प्रकरण नं.", "( प्रकरणन", "( प्र.नं.", "( प्र . नं .", "( प ्र . नं .", "(प्ररकण नं.", "(प्रकराण नं."]
                KEYWORDS_3 = ["निवेदक", "वादी", "पुनरावेदक", "निबेदक", "पुनरावदेक", "निवेदिका", "निवेदीका", "निवदेक", "न ि वेदक ः", "नि वेदक ः", "पुनरावेदन", "पुनरवेदिका", "पुनरावेदिका", "पुनरावेदीका", "बादि", "पुनराबेदक", "प्रतिबादी", "पुनरावेक", "अपीलाट", "निवेदनक", "उजुरवाला", "अपिलबाट", "अपिलाट"]
                KEYWORDS_4 = ["विपक्षी", "प्रतिवादी", "प्रत्यर्थी", "बिपक्षी", "विपक्षी ः", "पिपक्षी", "प्रत्यार्थी", "विपक्ष", "रेस्पोण्डेण्ट", "रेस्पोन्डेन्ट", "प्रत्यथी"]
                KEYWORDS_5 = ["विषय", "मुद्दा", "बिषय", "मूद्दा", "मुद्द", "मद्दा", "विपक्ष", "मुद्धा", "मुद् दा"]
                KEYWORDS_6 = ["अदालत", "इजलास", "इजालास", "इजलाश", "बेञ्च"]
                KEYWORDS_7 = ["आदेश", "फैसला", "फैसलमा", "निर्णय", "फै सला", "मुद्दा"]
                KEYWORDS_8 = ["न्यायाधीश", "माननीय", "न्यायधीश", "न्यायाधीस", "न्ययाधीश", "न्यायाधिश", "न्यायाधी", "न्यानायधीश", "नयायाधीश", "न्यायाधधिश", "नयाधश"]
                KEYWORDS_9 = ["विरूद्ध", "बिरूद्ध", "विरुद्ध", "बिरुद्ध"]
                KEYWORDS_10 = ["AP", "FN", "RE", "RI", "LE", "RV", "NF", "CI", "CR", "RC", "SA", "MS", "ND", "RB", "CF", "DF", "RF", "WO", "WH", "WS", "WF", "WC", "CC", "EC"]
                # Extract court information
                temp_ijlash = ""
                while ind < n:
                    text = tags[ind].get_text(separator=' ', strip=True)
                    if text:
                        if any(kw == text for kw in KEYWORDS_6):
                            details["इजलास"] = temp_ijlash
                            ind += 1
                            break
                        elif any(kw in text for kw in KEYWORDS_6):
                            details["इजलास"] = text
                            ind += 1
                            break
                        elif "न्यायाधीश" in text or "माननीय" in text:
                            details["इजलास"] = temp_ijlash
                            break
                        temp_ijlash = text
                    ind += 1
                if ind >= n:
                    ind = temp_ind_32
                else:
                    temp_ind_32 = ind
                # Extract judges
                judges = []
                while ind < n:
                    text = tags[ind].get_text(separator=' ', strip=True)
                    if text:
                        if "न्यायाधीश" in text or "माननीय" in text:
                            judges.append(text)
                        elif any(text.startswith(kw) for kw in KEYWORDS_7) and ("मिति" in text or "मिती" in text):
                            details["न्यायाधीश"] = judges
                            details["आदेश मिति"] = text
                            ind += 1
                            break
                        else:
                            details["केस_नम्बर"] = text
                    ind += 1
                if ind >= n:
                    ind = temp_ind_32
                else:
                    temp_ind_32 = ind
                # Standard case structure
                bisaya_before_niweduck = False
                details["विषय"] = ""
                while ind < n:
                    text = tags[ind].get_text(separator=' ', strip=True)
                    if any(kw in text for kw in KEYWORDS_3) or any(kw in text for kw in KEYWORDS_4):
                        break
                    if any(kw in text for kw in KEYWORDS_5):
                        bisaya_before_niweduck = True
                        break
                    ind += 1
                if ind >= n:
                    ind = temp_ind_32
                else:
                    temp_ind_32 = ind
                if bisaya_before_niweduck:
                    while ind < n:
                        text = tags[ind].get_text(separator=' ', strip=True)
                        if any(kw in text for kw in KEYWORDS_5):
                            details["विषय"] = text
                            ind += 1
                            break
                        ind += 1
                if ind >= n:
                    ind = temp_ind_32
                else:
                    temp_ind_32 = ind
                while ind < n:
                    text = tags[ind].get_text(separator=' ', strip=True)
                    if any(kw in text for kw in KEYWORDS_3):
                        if any(kw2 == text for kw2 in KEYWORDS_3):
                            ind += 1
                            if ind < n:  # bounds check
                                text = tags[ind].get_text(separator=' ', strip=True)
                        details["निवेदक"] = text
                        ind += 1
                        break
                    ind += 1
                if ind >= n:
                    ind = temp_ind_32
                else:
                    temp_ind_32 = ind
                while ind < n:
                    text = tags[ind].get_text(separator=' ', strip=True)
                    if any(kw in text for kw in KEYWORDS_4):
                        if any(kw2 == text for kw2 in KEYWORDS_4):
                            ind += 1
                            if ind < n:  # bounds check
                                text = tags[ind].get_text(separator=' ', strip=True)
                        details["विपक्षी"] = text
                        ind += 1
                        break
                    ind += 1
                if ind >= n:
                    ind = temp_ind_32
                else:
                    temp_ind_32 = ind
                if not bisaya_before_niweduck:
                    while ind < n:
                        text = tags[ind].get_text(separator=' ', strip=True)
                        if any(kw in text for kw in KEYWORDS_5):
                            details["विषय"] = text
                            ind += 1
                            break
                        ind += 1
                if ind >= n:
                    ind = temp_ind_32
                else:
                    temp_ind_32 = ind
                # Extract prakarans and tahar
                prakarans = []
                prev = ""
                tahar = []
                temp_flag_tahar = False
                for tag in tags[ind:]:
                    text = tag.get_text(separator=' ', strip=True)
                    if text:
                        if "§" in text or any(text.startswith(kw) for kw in KEYWORDS_2) or text in ("फैसला", "आदेश", "फैसलाः"):
                            if any(text.startswith(kw) for kw in KEYWORDS_2):
                                if prev:  # only flush prev when it has content
                                    prakarans.append(prev)
                                prakarans.append(text)
                                prev = ""
                            if "§" in text:
                                prakarans.append(text)
                            if text in ("फैसला", "आदेश", "फैसलाः"):
                                if not prakarans:
                                    prakarans.append(prev)
                        else:
                            prev = prev + " " + text if prev else text
                        if text in ["फैसला", "आदेश", "फैसलाः"] or temp_flag_tahar:
                            temp_flag_tahar = True
                            tahar.append(text)
                    # Process list items
                    next_sib = tag.find_next_sibling()
                    while next_sib and next_sib.name in ['ul', 'ol']:
                        for li in next_sib.find_all('li'):
                            li_text = li.get_text(separator=' ', strip=True)
                            if li_text:
                                if any(li_text.startswith(kw) for kw in KEYWORDS_2):
                                    if prev:  # only flush prev when it has content
                                        prakarans.append(prev)
                                    prakarans.append(li_text)
                                    prev = ""
                                else:
                                    prev = prev + " " + li_text if prev else li_text
                                if li_text in ["फैसला", "आदेश", "फैसलाः"] or temp_flag_tahar:
                                    temp_flag_tahar = True
                                    tahar.append(li_text)
                        next_sib = next_sib.find_next_sibling()
                details["प्रकरण"] = prakarans
                details["ठहर"] = tahar
            # Get HTML file path
            html_file_path = ""
            if mudda_type and sal:
                filename = self.generate_html_filename(url, mudda_type, sal)
                html_file_path = os.path.join(self.html_folder, filename)
            # Combine all data, handling lists and None values
            data = {
                "लिङ्क": url,
                "निर्णय नं.": decision_title,
                "भाग": bhaag or "N/A",
                "मुद्दाको किसिम": mudda_type,
                "साल": saal or "N/A",
                "महिना": mahina or "N/A",
                "अंक": anka or "N/A",
                "फैसला मिति": f"'{decision_date}'",
                "अदालत / इजलास": details.get("इजलास", "N/A"),
                "न्यायाधीश": json.dumps(details.get("न्यायाधीश", []), ensure_ascii=False),
                "आदेश मिति": details.get("आदेश मिति", "N/A"),
                "केस_नम्बर": json.dumps(details.get("केस_नम्बर", []), ensure_ascii=False) if isinstance(details.get("केस_नम्बर"), list) else details.get("केस_नम्बर", "N/A"),
                "विषय": details.get("विषय", "N/A"),
                "निवेदक": json.dumps(details.get("निवेदक", []), ensure_ascii=False) if isinstance(details.get("निवेदक"), list) else details.get("निवेदक", "N/A"),
                "विपक्षी": json.dumps(details.get("विपक्षी", []), ensure_ascii=False) if isinstance(details.get("विपक्षी"), list) else details.get("विपक्षी", "N/A"),
                "प्रकरण": json.dumps(details.get("प्रकरण", []), ensure_ascii=False),
                "ठहर": json.dumps(details.get("ठहर", []), ensure_ascii=False),
                "html_file_path": html_file_path
            }
            # Save to SQLite
            self.save_to_sqlite(data)
            print(f"{url} - Successfully Scraped and Entered")
            return True
        except Exception as e:
            print(f"Error scraping {url}: {e}")
            return False
    def scrape_case_details_2051_to_2061(self, url, mudda_type, sal=None, use_saved=True):
        """Scrape details from a single case URL (2051-2061)"""
        try:
            # Skip URLs already present in the database
            cursor = self.conn.cursor()
            cursor.execute('SELECT लिङ्क FROM cases WHERE लिङ्क = ?', (url,))
            if cursor.fetchone():
                print(f"URL {url} already exists in database, skipping...")
                return True
            # Get soup using saved HTML or web
            soup = self.return_soup(url, mudda_type, sal, use_saved)
            if not soup:
                print(f"Failed to get content for {url}")
                return False
            # Extract basic information (with bounds checking on the title split)
            title_tag = soup.find("h1", class_="post-title")
            decision_title = title_tag.get_text(strip=True).split()[2] if title_tag and len(title_tag.get_text(strip=True).split()) > 2 else "N/A"
            bhaag = self.get_edition_field(soup, "भाग")
            saal = self.get_edition_field(soup, "साल")
            mahina = self.get_edition_field(soup, "महिना")
            anka = self.get_edition_field(soup, "अंक")
            # Extract decision date
            post_meta = soup.find("div", class_="post-meta")
            decision_date = "N/A"
            if post_meta and "फैसला मिति" in post_meta.text:
                try:
                    decision_date = post_meta.text.strip().split("फैसला मिति :")[-1].split("\n")[0].strip().split()[0]
                except IndexError:
                    decision_date = "N/A"
            # Extract detailed information
            div_tag = soup.find("div", id="faisala_detail ")
            details = {}
            if div_tag:
                tags = div_tag.find_all(['h1', 'p'])
                n = len(tags)
                ind = 0
                temp_ind_32 = ind
                KEYWORDS_2 = ["प्रकरण नं.", "(प्रकरण नं", "(प्रकारण नं.", "९प्रकरण नं।", "(प्रकरण", "(प्र नं.", "( प्र. नं", "(प्र.नं", "(प्र. नं", "( प्रकरण नं.", "( प्रकरणन", "( प्र.नं.", "( प्र . नं .", "( प ्र . नं .", "(प्ररकण नं.", "(प्रकराण नं."]
                KEYWORDS_3 = ["निवेदक", "वादी", "पुनरावेदक", "निबेदक", "पुनरावदेक", "निवेदिका", "निवेदीका", "निवदेक", "न ि वेदक ः", "नि वेदक ः", "पुनरावेदन", "पुनरवेदिका", "पुनरावेदिका", "पुनरावेदीका", "बादि", "पुनराबेदक", "प्रतिबादी", "पुनरावेक", "अपीलाट", "निवेदनक", "उजुरवाला", "अपिलबाट", "अपिलाट"]
                KEYWORDS_4 = ["विपक्षी", "प्रतिवादी", "प्रत्यर्थी", "बिपक्षी", "विपक्षी ः", "पिपक्षी", "प्रत्यार्थी", "विपक्ष", "रेस्पोण्डेण्ट", "रेस्पोन्डेन्ट", "प्रत्यथी"]
                KEYWORDS_5 = ["विषय", "मुद्दा", "बिषय", "मूद्दा", "मुद्द", "मद्दा", "विपक्ष", "मुद्धा", "मुद् दा"]
                KEYWORDS_6 = ["अदालत", "इजलास", "इजालास", "इजलाश", "बेञ्च"]
                KEYWORDS_7 = ["आदेश", "फैसला", "फैसलमा", "निर्णय", "फै सला", "मुद्दा"]
                KEYWORDS_8 = ["न्यायाधीश", "माननीय", "न्यायधीश", "न्यायाधीस", "न्ययाधीश", "न्यायाधिश", "न्यायाधी", "न्यानायधीश", "नयायाधीश", "न्यायाधधिश", "नयाधश"]
                KEYWORDS_9 = ["विरूद्ध", "बिरूद्ध", "विरुद्ध", "बिरुद्ध"]
                KEYWORDS_10 = ["AP", "FN", "RE", "RI", "LE", "RV", "NF", "CI", "CR", "RC", "SA", "MS", "ND", "RB", "CF", "DF", "RF", "WO", "WH", "WS", "WF", "WC", "CC", "EC"]
                # Extract court information
                while ind < n:
                    text = tags[ind].get_text(separator=' ', strip=True)
                    if text and ("इजलास" in text or "इजालास" in text):
                        details["इजलास"] = text
                        ind += 1
                        break
                    ind += 1
                if ind >= n:
                    ind = temp_ind_32
                else:
                    temp_ind_32 = ind
                # Extract judges
                judges = []
                while ind < n:
                    text = tags[ind].get_text(separator=' ', strip=True)
                    if text:
                        if "न्यायाधीश" in text or "माननीय" in text:
                            judges.append(text)
                        elif any(text.startswith(kw) for kw in KEYWORDS_7) and ("मिति" in text or "मिती" in text):
                            details["न्यायाधीश"] = judges
                            details["आदेश मिति"] = text
                            ind += 1
                            break
                        else:
                            details["केस_नम्बर"] = text
                    ind += 1
                if ind >= n:
                    ind = temp_ind_32
                else:
                    temp_ind_32 = ind
                # Standard case structure
                while ind < n:
                    text = tags[ind].get_text(separator=' ', strip=True)
                    if "विषय" in text or "मुद्दा" in text or "बिषय" in text or "मूद्दाः" in text:
                        details["विषय"] = text
                        ind += 1
                        break
                    ind += 1
                if ind >= n:
                    ind = temp_ind_32
                while ind < n:
                    text = tags[ind].get_text(separator=' ', strip=True)
                    if any(kw in text for kw in KEYWORDS_3):
                        if any(kw2 == text for kw2 in KEYWORDS_3):
                            ind += 1
                            if ind < n:  # bounds check
                                text = tags[ind].get_text(separator=' ', strip=True)
                        details["निवेदक"] = text
                        ind += 1
                        break
                    ind += 1
                if ind >= n:
                    ind = temp_ind_32
                else:
                    temp_ind_32 = ind
                while ind < n:
                    text = tags[ind].get_text(separator=' ', strip=True)
                    if any(kw in text for kw in KEYWORDS_4):
                        if any(kw2 == text for kw2 in KEYWORDS_4):
                            ind += 1
                            if ind < n:  # bounds check
                                text = tags[ind].get_text(separator=' ', strip=True)
                        details["विपक्षी"] = text
                        ind += 1
                        break
                    ind += 1
                if ind >= n:
                    ind = temp_ind_32
                else:
                    temp_ind_32 = ind
                # Extract prakarans and tahar
                prakarans = []
                prev = ""
                tahar = []
                temp_flag_tahar = False
                for tag in tags[ind:]:
                    text = tag.get_text(separator=' ', strip=True)
                    if text:
                        if "§" in text or any(text.startswith(kw) for kw in KEYWORDS_2) or text in ("फैसला", "आदेश", "फैसलाः"):
                            if any(text.startswith(kw) for kw in KEYWORDS_2):
                                if prev:  # only flush prev when it has content
                                    prakarans.append(prev)
                                prakarans.append(text)
                                prev = ""
                            if "§" in text:
                                prakarans.append(text)
                            if text in ("फैसला", "आदेश", "फैसलाः"):
                                if not prakarans:
                                    prakarans.append(prev)
                        else:
                            prev = prev + " " + text if prev else text
                        if text in ["फैसला", "आदेश", "फैसलाः"] or temp_flag_tahar:
                            temp_flag_tahar = True
                            tahar.append(text)
                    # Process list items
                    next_sib = tag.find_next_sibling()
                    while next_sib and next_sib.name in ['ul', 'ol']:
                        for li in next_sib.find_all('li'):
                            li_text = li.get_text(separator=' ', strip=True)
                            if li_text:
                                if any(li_text.startswith(kw) for kw in KEYWORDS_2):
                                    if prev:  # only flush prev when it has content
                                        prakarans.append(prev)
                                    prakarans.append(li_text)
                                    prev = ""
                                else:
                                    prev = prev + " " + li_text if prev else li_text
                                if li_text in ["फैसला", "आदेश", "फैसलाः"] or temp_flag_tahar:
                                    temp_flag_tahar = True
                                    tahar.append(li_text)
                        next_sib = next_sib.find_next_sibling()
                details["प्रकरण"] = prakarans
                details["ठहर"] = tahar
            # Get HTML file path
            html_file_path = ""
            if mudda_type and sal:
                filename = self.generate_html_filename(url, mudda_type, sal)
                html_file_path = os.path.join(self.html_folder, filename)
            # Combine all data, handling lists and None values
            data = {
                "लिङ्क": url,
                "निर्णय नं.": decision_title,
                "भाग": bhaag or "N/A",
                "मुद्दाको किसिम": mudda_type,
                "साल": saal or "N/A",
                "महिना": mahina or "N/A",
                "अंक": anka or "N/A",
                "फैसला मिति": f"'{decision_date}'",
                "अदालत / इजलास": details.get("इजलास", "N/A"),
                "न्यायाधीश": json.dumps(details.get("न्यायाधीश", []), ensure_ascii=False),
                "आदेश मिति": details.get("आदेश मिति", "N/A"),
                "केस_नम्बर": json.dumps(details.get("केस_नम्बर", []), ensure_ascii=False) if isinstance(details.get("केस_नम्बर"), list) else details.get("केस_नम्बर", "N/A"),
                "विषय": details.get("विषय", "N/A"),
                "निवेदक": json.dumps(details.get("निवेदक", []), ensure_ascii=False) if isinstance(details.get("निवेदक"), list) else details.get("निवेदक", "N/A"),
                "विपक्षी": json.dumps(details.get("विपक्षी", []), ensure_ascii=False) if isinstance(details.get("विपक्षी"), list) else details.get("विपक्षी", "N/A"),
                "प्रकरण": json.dumps(details.get("प्रकरण", []), ensure_ascii=False),
                "ठहर": json.dumps(details.get("ठहर", []), ensure_ascii=False),
                "html_file_path": html_file_path
            }
            # Save to SQLite
            self.save_to_sqlite(data)
            print(f"{url} - Successfully Scraped and Entered")
            return True
        except Exception as e:
            print(f"Error scraping {url}: {e}")
            return False
    def scrape_case_details_2062_to_2072(self, url, mudda_type, sal=None, use_saved=True):
        """Scrape details from a single case URL (2062-2072)"""
        try:
            # Skip URLs already present in the database
            cursor = self.conn.cursor()
            cursor.execute('SELECT लिङ्क FROM cases WHERE लिङ्क = ?', (url,))
            if cursor.fetchone():
                print(f"URL {url} already exists in database, skipping...")
                return True
            # Get soup using saved HTML or web
            soup = self.return_soup(url, mudda_type, sal, use_saved)
            if not soup:
                print(f"Failed to get content for {url}")
                return False
            # Extract basic information (with bounds checking on the title split)
            title_tag = soup.find("h1", class_="post-title")
            decision_title = title_tag.get_text(strip=True).split()[2] if title_tag and len(title_tag.get_text(strip=True).split()) > 2 else "N/A"
            bhaag = self.get_edition_field(soup, "भाग")
            saal = self.get_edition_field(soup, "साल")
            mahina = self.get_edition_field(soup, "महिना")
            anka = self.get_edition_field(soup, "अंक")
            # Extract decision date
            post_meta = soup.find("div", class_="post-meta")
            decision_date = "N/A"
            if post_meta and "फैसला मिति" in post_meta.text:
                try:
                    decision_date = post_meta.text.strip().split("फैसला मिति :")[-1].split("\n")[0].strip().split()[0]
                except IndexError:
                    decision_date = "N/A"
            # Extract detailed information
            div_tag = soup.find("div", id="faisala_detail ")
            details = {}
            if div_tag:
                tags = div_tag.find_all(['h1', 'p'])
                n = len(tags)
                ind = 0
                temp_ind_32 = ind
                KEYWORDS_2 = ["प्रकरण नं.", "(प्रकरण नं", "(प्रकारण नं.", "९प्रकरण नं।", "(प्रकरण", "(प्र नं.", "( प्र. नं", "(प्र.नं", "(प्र. नं", "( प्रकरण नं.", "( प्रकरणन", "( प्र.नं.", "( प्र . नं .", "( प ्र . नं .", "(प्ररकण नं.", "(प्रकराण नं."]
                KEYWORDS_3 = ["निवेदक", "वादी", "पुनरावेदक", "निबेदक", "पुनरावदेक", "निवेदिका", "निवेदीका", "निवदेक", "न ि वेदक ः", "नि वेदक ः", "पुनरावेदन", "पुनरवेदिका", "पुनरावेदिका", "पुनरावेदीका", "बादि", "पुनराबेदक", "प्रतिबादी", "पुनरावेक", "अपीलाट", "निवेदनक", "उजुरवाला", "अपिलबाट", "अपिलाट"]
                KEYWORDS_4 = ["विपक्षी", "प्रतिवादी", "प्रत्यर्थी", "बिपक्षी", "विपक्षी ः", "पिपक्षी", "प्रत्यार्थी", "विपक्ष", "रेस्पोण्डेण्ट", "रेस्पोन्डेन्ट", "प्रत्यथी"]
                KEYWORDS_5 = ["विषय", "मुद्दा", "बिषय", "मूद्दा", "मुद्द", "मद्दा", "विपक्ष", "मुद्धा", "मुद् दा"]
                KEYWORDS_6 = ["अदालत", "इजलास", "इजालास", "इजलाश", "बेञ्च"]
                KEYWORDS_7 = ["आदेश", "फैसला", "फैसलमा", "निर्णय", "फै सला", "मुद्दा"]
                KEYWORDS_8 = ["न्यायाधीश", "माननीय", "न्यायधीश", "न्यायाधीस", "न्ययाधीश", "न्यायाधिश", "न्यायाधी", "न्यानायधीश", "नयायाधीश", "न्यायाधधिश", "नयाधश"]
                KEYWORDS_9 = ["विरूद्ध", "बिरूद्ध", "विरुद्ध", "बिरुद्ध"]
                KEYWORDS_10 = ["AP", "FN", "RE", "RI", "LE", "RV", "NF", "CI", "CR", "RC", "SA", "MS", "ND", "RB", "CF", "DF", "RF", "WO", "WH", "WS", "WF", "WC", "CC", "EC"]
                # Extract court information
                temp_ijlash = ""
                while ind < n:
                    text = tags[ind].get_text(separator=' ', strip=True)
                    if text:
                        if any(kw == text for kw in KEYWORDS_6):
                            if "निर्णय नं." not in temp_ijlash:
                                details["अदालत / इजलास"] = temp_ijlash
                            ind += 1
                            break
                        elif any(kw in text for kw in KEYWORDS_6):
                            details["अदालत / इजलास"] = text
                            ind += 1
                            if ind < n:  # bounds check before peeking at the next tag
                                text_2 = tags[ind].get_text(separator=' ', strip=True)
                                if not any(kw in text_2 for kw in KEYWORDS_8):
                                    details["अदालत / इजलास"] = text + " " + text_2
                                    ind += 1
                            break
                        elif any(kw in text for kw in KEYWORDS_8):
                            if "निर्णय नं." not in temp_ijlash:
                                details["अदालत / इजलास"] = temp_ijlash
                            break
                        temp_ijlash = text
                    ind += 1
                if ind >= n:
                    ind = temp_ind_32
                else:
                    temp_ind_32 = ind
                # Extract judges
                judges = []
                faisla_miti_before_case_no = False
                subject_before_case_no = False
                while ind < n:
                    text = tags[ind].get_text(separator=' ', strip=True)
                    if text:
                        if any(kw in text for kw in KEYWORDS_8):
                            judges.append(text)
                        else:
                            details["न्यायाधीश"] = judges
                            if any(text.startswith(kw) for kw in KEYWORDS_7) and ("मिति" in text or "मिती" in text):
                                details["आदेश मिति"] = text
                                ind += 1
                                faisla_miti_before_case_no = True
                            elif any(kw in text for kw in KEYWORDS_10):
                                details["केस_नम्बर"] = text
                            elif not any(kw2 in text for kw2 in KEYWORDS_3) and not any(kw2 in text for kw2 in KEYWORDS_5):
                                if text != "फैसला":
                                    details["केस_नम्बर"] = text
                                else:
                                    ind += 1
                                    if ind < n:  # bounds check
                                        details["केस_नम्बर"] = tags[ind].get_text(separator=' ', strip=True)
                            ind += 1
                            break
                    ind += 1
                if ind >= n:
                    ind = temp_ind_32
                else:
                    temp_ind_32 = ind
                # Standard case structure
                if faisla_miti_before_case_no:
                    while ind < n:
                        text = tags[ind].get_text(separator=' ', strip=True)
                        if text:
                            if any(kw in text for kw in KEYWORDS_10):
                                details["केस_नम्बर"] = text
                            elif any(text.startswith(kw) for kw in KEYWORDS_5):
                                subject_before_case_no = True
                                details["विषय"] = text
                            else:
                                details["केस_नम्बर"] = text
                            ind += 1
                            break
                        ind += 1
                else:
                    while ind < n:
                        text = tags[ind].get_text(separator=' ', strip=True)
                        if text:
                            if any(text.startswith(kw) for kw in KEYWORDS_7) and ("मिति" in text or "मिती" in text):
                                details["आदेश मिति"] = text
                                ind += 1
                                break
                            if any(text.startswith(kw) for kw in KEYWORDS_2) or text in ("फैसला", "आदेश", "फैसलाः"):
                                ind = temp_ind_32
                                break
                        ind += 1
                if ind >= n:
                    ind = temp_ind_32
                else:
                    temp_ind_32 = ind
                if subject_before_case_no:
                    while ind < n:
                        text = tags[ind].get_text(separator=' ', strip=True)
                        if text:
                            details["केस_नम्बर"] = text
                            ind += 1
                            break
                        ind += 1
                else:
                    while ind < n:
                        text = tags[ind].get_text(separator=' ', strip=True)
                        if text:
                            if any(text.startswith(kw) for kw in KEYWORDS_5):
                                details["विषय"] = text
                                ind += 1
                                break
                            if any(kw in text for kw in KEYWORDS_3):
                                ind = temp_ind_32
                                break
                        ind += 1
                if ind >= n:
                    ind = temp_ind_32
                else:
                    temp_ind_32 = ind
                # Count standalone "विरूद्ध" separators to detect consolidated
                # (multi-party) cases before the section markers begin
                temp_ind_64 = ind
                count_how_many = 0
                while temp_ind_64 < n:
                    text = tags[temp_ind_64].get_text(separator=' ', strip=True)
                    if text and any(kw == text for kw in KEYWORDS_9):
                        count_how_many += 1
                    if any(text.startswith(kw) for kw in KEYWORDS_2):
                        break
                    temp_ind_64 += 1
                if count_how_many > 1:
                    case_no = []
                    appellant = []
                    opposition = []
                    while count_how_many > 0:
                        while ind < n:
                            text = tags[ind].get_text(separator=' ', strip=True)
                            if any(kw in text for kw in KEYWORDS_3):
                                if any(kw2 == text for kw2 in KEYWORDS_3):
                                    ind += 1
                                    if ind < n:  # bounds check
                                        text = tags[ind].get_text(separator=' ', strip=True)
                                appellant.append(text)
                                ind += 1
                                break
                            ind += 1
                        if ind >= n:
                            ind = temp_ind_32
                        else:
                            temp_ind_32 = ind
                        while ind < n:
                            text = tags[ind].get_text(separator=' ', strip=True)
                            if any(kw in text for kw in KEYWORDS_4):
                                if any(kw2 == text for kw2 in KEYWORDS_4):
                                    ind += 1
                                    if ind < n:  # bounds check
                                        text = tags[ind].get_text(separator=' ', strip=True)
                                opposition.append(text)
                                ind += 1
                                break
                            ind += 1
                        if ind >= n:
                            ind = temp_ind_32
                        else:
                            temp_ind_32 = ind
                        count_how_many -= 1
                    # Re-scan from the top to collect every case-number line
                    temp_ind_128 = 0
                    while temp_ind_128 < n:
                        text = tags[temp_ind_128].get_text(separator=' ', strip=True)
                        if text:
                            if any(kw in text for kw in KEYWORDS_10):
                                case_no.append(text)
                            if any(text.startswith(kw) for kw in KEYWORDS_2) or any(kw == text for kw in KEYWORDS_7):
                                break
                        temp_ind_128 += 1
                    details["केस_नम्बर"] = case_no
                    details["निवेदक"] = appellant
                    details["विपक्षी"] = opposition
                else:
                    while ind < n:
                        text = tags[ind].get_text(separator=' ', strip=True)
                        if any(kw in text for kw in KEYWORDS_3):
                            if any(kw2 == text for kw2 in KEYWORDS_3):
                                ind += 1
                                if ind < n:  # bounds check
                                    text = tags[ind].get_text(separator=' ', strip=True)
                            details["निवेदक"] = text
                            ind += 1
                            break
                        ind += 1
                    if ind >= n:
                        ind = temp_ind_32
                    else:
                        temp_ind_32 = ind
                    while ind < n:
                        text = tags[ind].get_text(separator=' ', strip=True)
                        if any(kw in text for kw in KEYWORDS_4):
                            if any(kw2 == text for kw2 in KEYWORDS_4):
                                ind += 1
                                if ind < n:  # bounds check
                                    text = tags[ind].get_text(separator=' ', strip=True)
                            details["विपक्षी"] = text
                            ind += 1
                            break
                        ind += 1
                    if ind >= n:
                        ind = temp_ind_32
                    else:
                        temp_ind_32 = ind
                # Extract prakarans and tahar
                prakarans = []
                prev = ""
                tahar = []
                temp_flag_tahar = False
                for tag in tags[ind:]:
                    text = tag.get_text(separator=' ', strip=True)
                    if text:
                        if "§" in text or any(text.startswith(kw) for kw in KEYWORDS_2) or text in ("फैसला", "आदेश", "फैसलाः"):
                            if any(text.startswith(kw) for kw in KEYWORDS_2):
                                if prev:  # only flush prev when it has content
                                    prakarans.append(prev)
                                prakarans.append(text)
                                prev = ""
                            if "§" in text:
                                prakarans.append(text)
                            if text in ("फैसला", "आदेश", "फैसलाः"):
                                if not prakarans:
                                    prakarans.append(prev)
                        else:
                            prev = prev + " " + text if prev else text
                        if text in ["फैसला", "आदेश", "फैसलाः"] or temp_flag_tahar:
                            temp_flag_tahar = True
                            tahar.append(text)
                    # Process list items
                    next_sib = tag.find_next_sibling()
                    while next_sib and next_sib.name in ['ul', 'ol']:
                        for li in next_sib.find_all('li'):
                            li_text = li.get_text(separator=' ', strip=True)
                            if li_text:
                                if any(li_text.startswith(kw) for kw in KEYWORDS_2):
                                    if prev:  # only flush prev when it has content
                                        prakarans.append(prev)
                                    prakarans.append(li_text)
                                    prev = ""
                                else:
                                    prev = prev + " " + li_text if prev else li_text
                                if li_text in ["फैसला", "आदेश", "फैसलाः"] or temp_flag_tahar:
                                    temp_flag_tahar = True
                                    tahar.append(li_text)
                        next_sib = next_sib.find_next_sibling()
                details["प्रकरण"] = prakarans
                details["ठहर"] = tahar
            # Get HTML file path
            html_file_path = ""
            if mudda_type and sal:
                filename = self.generate_html_filename(url, mudda_type, sal)
                html_file_path = os.path.join(self.html_folder, filename)
            # Combine all data, handling lists and None values
            data = {
                "लिङ्क": url,
                "निर्णय नं.": decision_title,
                "भाग": bhaag or "N/A",
                "मुद्दाको किसिम": mudda_type,
                "साल": saal or "N/A",
                "महिना": mahina or "N/A",
                "अंक": anka or "N/A",
                "फैसला मिति": f"'{decision_date}'",
                "अदालत / इजलास": details.get("अदालत / इजलास", "N/A"),
                "न्यायाधीश": json.dumps(details.get("न्यायाधीश", []), ensure_ascii=False),
                "आदेश मिति": details.get("आदेश मिति", "N/A"),
                "केस_नम्बर": json.dumps(details.get("केस_नम्बर", []), ensure_ascii=False) if isinstance(details.get("केस_नम्बर"), list) else details.get("केस_नम्बर", "N/A"),
                "विषय": details.get("विषय", "N/A"),
                "निवेदक": json.dumps(details.get("निवेदक", []), ensure_ascii=False) if isinstance(details.get("निवेदक"), list) else details.get("निवेदक", "N/A"),
                "विपक्षी": json.dumps(details.get("विपक्षी", []), ensure_ascii=False) if isinstance(details.get("विपक्षी"), list) else details.get("विपक्षी", "N/A"),
                "प्रकरण": json.dumps(details.get("प्रकरण", []), ensure_ascii=False),
                "ठहर": json.dumps(details.get("ठहर", []), ensure_ascii=False),
                "html_file_path": html_file_path
            }
            # Save to SQLite
            self.save_to_sqlite(data)
            print(f"{url} - Successfully Scraped and Entered")
            return True
        except Exception as e:
            print(f"Error scraping {url}: {e}")
            return False
| def scrape_case_details_2073_to_2080_and_beyond(self, url, mudda_type, sal = None, use_saved=True): | |
| """Scrape details from a single case URL""" | |
| try: | |
| r = requests.get(url, timeout=15) | |
| if r.status_code != 200: | |
| print(f"Failed to retrieve {url}, Status code: {r.status_code}") | |
| return False | |
| # Get soup using saved HTML or web | |
| soup = self.return_soup(url, mudda_type, sal, use_saved) | |
| if not soup: | |
| print(f"Failed to get content for {url}") | |
| return False | |
| # Extract basic information | |
| title_tag = soup.find("h1", class_="post-title") | |
| decision_title = title_tag.get_text(strip=True).split()[2] if title_tag else "N/A" | |
| bhaag = self.get_edition_field(soup, "भाग") | |
| saal = self.get_edition_field(soup, "साल") | |
| mahina = self.get_edition_field(soup, "महिना") | |
| anka = self.get_edition_field(soup, "अंक") | |
| # Extract decision date | |
| post_meta = soup.find("div", class_="post-meta") | |
| decision_date = "N/A" | |
| if post_meta and "फैसला मिति" in post_meta.text: | |
| decision_date = post_meta.text.strip().split("फैसला मिति :")[-1].split("\n")[0].strip().split()[0] | |
| # Extract detailed information | |
| div_tag = soup.find("div", id="faisala_detail ") | |
| details = {} | |
| if div_tag: | |
| tags = div_tag.find_all(['h1', 'p']) | |
| n = len(tags) | |
| ind = 0 | |
| temp_ind_32 = ind | |
| #KEYWORDS_2 = ["(प्रकरण नं", "(प्रकारण नं.", "९प्रकरण नं।", "(प्रकरण", "(प्र.नं."] | |
| KEYWORDS_2 = ["प्रकरण नं.", "(प्रकरण नं", "(प्रकारण नं.", "९प्रकरण नं।", "(प्रकरण", "(प्र नं.","( प्र. नं","(प्र.नं", "(प्र. नं", "( प्रकरण नं.", "( प्रकरणन", "( प्र.नं.", "( प्र . नं .", "( प ्र . नं .", "(प्ररकण नं.", "(प्रकराण नं."] | |
| KEYWORDS_3 = ["निवेदक", "वादी", "पुनरावेदक", "निबेदक", "पुनरावदेक", "निवेदिका", "निवेदीका", "निवदेक", "न ि वेदक ः", "नि वेदक ः", "पुनरावेदन", "पुनरवेदिका", "पुनरावेदिका", "पुनरावेदीका", "बादि", "पुनराबेदक", "प्रतिबादी", "पुनरावेक", "अपीलाट", "निवेदनक", "उजुरवाला", "अपिलबाट", "अपिलाट"] | |
| KEYWORDS_4 = ["विपक्षी", "प्रतिवादी", "प्रत्यर्थी", "बिपक्षी", "विपक्षी ः", "पिपक्षी", "प्रत्यार्थी", "विपक्ष", "रेस्पोण्डेण्ट", "रेस्पोन्डेन्ट", "प्रत्यथी"] | |
| KEYWORDS_5 = ["विषय", "मुद्दा", "बिषय", "मूद्दा", "मुद्द", "मद्दा", "विपक्ष", "मुद्धा", "मुद् दा"] | |
| KEYWORDS_6 = ["अदालत", "इजलास", "इजालास", "इजलाश", "बेञ्च"] | |
| KEYWORDS_7 = ["आदेश", "फैसला", "फैसलमा", "निर्णय", "फै सला", "मुद्दा"] | |
| KEYWORDS_8 = ["न्यायाधीश", "माननीय", "न्यायधीश", "न्यायाधीस", "न्ययाधीश", "न्यायाधिश", "न्यायाधी", "न्यानायधीश", "नयायाधीश", "न्यायाधधिश", "नयाधश"] | |
| KEYWORDS_9 = [ "विरूद्ध", "बिरूद्ध", "विरुद्ध", "बिरुद्ध"] | |
| KEYWORDS_10 = ["AP", "FN", "RE", "RI", "LE", "RV", "NF", "CI", "CR", "RC", "SA", "MS", "ND", "RB", "CF", "DF", "RF", "WO", "WH", "WS", "WF", "WC", "CC", "EC"] | |
| # Extract court information | |
| while ind < n: | |
| text = tags[ind].get_text(strip=True) | |
| if text and any(kw in text for kw in KEYWORDS_6): | |
| details["अदालत / इजलास"] = text | |
| ind += 1 | |
| break | |
| ind += 1 | |
| if ind >= n: | |
| ind = temp_ind_32 | |
| else: | |
| temp_ind_32 = ind | |
| # Extract judges | |
| judges = [] | |
| while ind < n: | |
| text = tags[ind].get_text(strip=True) | |
| if text: | |
| if any(kw in text for kw in KEYWORDS_8): | |
| judges.append(text) | |
| if any(text.startswith(kw) for kw in KEYWORDS_7) and ("मिति" in text or "मिती" in text): | |
| details["न्यायाधीश"] = judges | |
| details["आदेश मिति"] = text | |
| ind += 1 | |
| break | |
| ind += 1 | |
| if ind >= n: | |
| ind = temp_ind_32 | |
| else: | |
| temp_ind_32 = ind | |
| # Extract case details | |
| bisaya_before_kas_no = False | |
| while ind < n: | |
| text = tags[ind].get_text(strip=True) | |
| if text: | |
| if any(kw in text for kw in KEYWORDS_5): | |
| bisaya_before_kas_no = True | |
| details["विषय"] = text | |
| ind += 1 | |
| break | |
| details["केस_नम्बर"] = text | |
| break | |
| ind += 1 | |
| if ind > n: | |
| ind = temp_ind_32 | |
| else: | |
| temp_ind_32 = ind | |
| # Handle different case structures | |
| if bisaya_before_kas_no: | |
| case_no = [] | |
| appellant = [] | |
| opposition = [] | |
| temp_flag = True | |
| while temp_flag and ind < n: | |
| # Extract case number | |
| while ind < n: | |
| text = tags[ind].get_text(strip=True) | |
| if text: | |
| case_no.append(text) | |
| ind += 1 | |
| break | |
| ind += 1 | |
| # Extract appellant | |
| while ind < n: | |
| text = tags[ind].get_text(strip=True) | |
| if any(kw in text for kw in KEYWORDS_3): | |
| if any(kw2 == text for kw2 in KEYWORDS_3) and ind + 1 < n: | |
| ind += 1 | |
| text = tags[ind].get_text(strip=True) | |
| appellant.append(text) | |
| ind += 1 | |
| break | |
| ind += 1 | |
| # Extract opposition | |
| while ind < n: | |
| text = tags[ind].get_text(strip=True) | |
| if any(kw in text for kw in KEYWORDS_4): | |
| if any(kw2 == text for kw2 in KEYWORDS_4) and ind + 1 < n: | |
| ind += 1 | |
| text = tags[ind].get_text(strip=True) | |
| opposition.append(text) | |
| ind += 1 | |
| break | |
| ind += 1 | |
| # Look ahead without consuming: a paragraph-number marker means the party blocks are done; | |
| # a bare "versus" separator means another (case number, parties) block follows, so keep looping. | |
| temp_ind = ind | |
| for tag in tags[temp_ind:]: | |
| text = tag.get_text(strip=True) | |
| if any(kw in text for kw in KEYWORDS_2): | |
| temp_flag = False | |
| details["केस_नम्बर"] = case_no | |
| details["निवेदक"] = appellant | |
| details["विपक्षी"] = opposition | |
| break | |
| elif any(kw == text for kw in KEYWORDS_9): | |
| break | |
| if ind >= n: | |
| ind = temp_ind_32 | |
| else: | |
| temp_ind_32 = ind | |
| else: | |
| # Layout B: the case number was already captured above; now read विषय, निवेदक, विपक्षी in order | |
| while ind < n: | |
| text = tags[ind].get_text(strip=True) | |
| if any(kw in text for kw in KEYWORDS_5): | |
| details["विषय"] = text | |
| ind += 1 | |
| break | |
| ind += 1 | |
| if ind >= n: | |
| ind = temp_ind_32 | |
| else: | |
| temp_ind_32 = ind | |
| while ind < n: | |
| text = tags[ind].get_text(strip=True) | |
| if any(kw in text for kw in KEYWORDS_3): | |
| if any(kw2 == text for kw2 in KEYWORDS_3) and ind + 1 < n: | |
| ind += 1 | |
| text = tags[ind].get_text(strip=True) | |
| details["निवेदक"] = text | |
| ind += 1 | |
| break | |
| ind += 1 | |
| if ind >= n: | |
| ind = temp_ind_32 | |
| else: | |
| temp_ind_32 = ind | |
| while ind < n: | |
| text = tags[ind].get_text(strip=True) | |
| if any(kw in text for kw in KEYWORDS_4): | |
| if any(kw2 == text for kw2 in KEYWORDS_4) and ind + 1 < n: | |
| ind += 1 | |
| text = tags[ind].get_text(strip=True) | |
| details["विपक्षी"] = text | |
| ind += 1 | |
| break | |
| ind += 1 | |
| if ind >= n: | |
| ind = temp_ind_32 | |
| else: | |
| temp_ind_32 = ind | |
| # Extract prakarans and tahar | |
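| # प्रकरण (prakaran) = the numbered holding paragraphs, delimited by "§" bullets or "प्रकरण नं." markers; | |
| # ठहर (tahar) = the operative ruling text, everything from the "फैसला"/"आदेश" heading onward. | |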
| prakarans = [] | |
| prev = "" | |
| tahar = [] | |
| temp_flag_tahar = False | |
| for tag in tags[ind:]: | |
| text = tag.get_text(separator=' ', strip=True) | |
| if text: | |
| #if "§" in text or any(kw in text for kw in KEYWORDS_2): | |
| #prakarans.append(text) | |
| if "§" in text or any(text.startswith(kw) for kw in KEYWORDS_2) or "फैसला"==text or "आदेश"==text or "फैसलाः"==text: | |
| if any(text.startswith(kw) for kw in KEYWORDS_2): | |
| if prev:  # only flush accumulated text if it is non-empty | |
| prakarans.append(prev) | |
| prakarans.append(text) | |
| prev = "" | |
| if "§" in text: | |
| prakarans.append(text) | |
| if "फैसला"==text or "आदेश"==text or "फैसलाः"==text: | |
| if not prakarans: | |
| prakarans.append(prev) | |
| else: | |
| prev = prev + " " + text if prev else text | |
| if text in ["फैसला", "आदेश", "फैसलाः"] or temp_flag_tahar: | |
| temp_flag_tahar = True | |
| tahar.append(text) | |
| # Process list items: paragraphs sometimes continue into sibling <ul>/<ol> blocks, so apply the same rules to each <li> | |
| next_sib = tag.find_next_sibling() | |
| while next_sib and next_sib.name in ['ul', 'ol']: | |
| for li in next_sib.find_all('li'): | |
| li_text = li.get_text(separator=' ', strip=True) | |
| if li_text: | |
| if any(li_text.startswith(kw) for kw in KEYWORDS_2): | |
| if prev:  # only flush accumulated text if it is non-empty | |
| prakarans.append(prev) | |
| prakarans.append(li_text) | |
| prev = "" | |
| else: | |
| prev = prev + " " + li_text if prev else li_text | |
| if li_text in ["फैसला", "आदेश", "फैसलाः"] or temp_flag_tahar: | |
| temp_flag_tahar = True | |
| tahar.append(li_text) | |
| next_sib = next_sib.find_next_sibling() | |
| details["प्रकरण"] = prakarans | |
| details["ठहर"] = tahar | |
| # Get HTML file path | |
| html_file_path = "" | |
| if mudda_type and sal: | |
| filename = self.generate_html_filename(url, mudda_type, sal) | |
| html_file_path = os.path.join(self.html_folder, filename) | |
| # Combine all data, handling lists and strings appropriately | |
| data = { | |
| "लिङ्क": url, | |
| "निर्णय नं.": decision_title, | |
| "भाग": bhaag, | |
| "मुद्दाको किसिम": mudda_type, | |
| "साल": saal, | |
| "महिना": mahina, | |
| "अंक": anka, | |
| "फैसला मिति": f"'{decision_date}'", | |
| "अदालत / इजलास": details.get("अदालत / इजलास", "N/A"), | |
| "न्यायाधीश": json.dumps(details.get("न्यायाधीश", []), ensure_ascii=False), | |
| "आदेश मिति": details.get("आदेश मिति", "N/A"), | |
| "केस_नम्बर": json.dumps(details.get("केस_नम्बर", []), ensure_ascii=False) if isinstance(details.get("केस_नम्बर"), list) else details.get("केस_नम्बर", "N/A"), | |
| "विषय": details.get("विषय", "N/A"), | |
| "निवेदक": json.dumps(details.get("निवेदक", []), ensure_ascii=False) if isinstance(details.get("निवेदक"), list) else details.get("निवेदक", "N/A"), | |
| "विपक्षी": json.dumps(details.get("विपक्षी", []), ensure_ascii=False) if isinstance(details.get("विपक्षी"), list) else details.get("विपक्षी", "N/A"), | |
| "प्रकरण": json.dumps(details.get("प्रकरण", []), ensure_ascii=False), | |
| "ठहर": json.dumps(details.get("ठहर", []), ensure_ascii=False), | |
| "html_file_path": html_file_path | |
| } | |
| # Save to SQLite | |
| self.save_to_sqlite(data) | |
| print(f"{url} - Successfully Scraped and Entered") | |
| return True | |
| except Exception as e: | |
| print(f"Error scraping {url}: {e}") | |
| return False | |
| def save_to_sqlite(self, data): | |
| """Save data to SQLite database""" | |
| cursor = self.conn.cursor() | |
| try: | |
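| # INSERT OR REPLACE keys on the UNIQUE constraint on लिङ्क: re-scraping a link replaces | |
| # its row (note that the replaced row receives a fresh autoincrement id). | |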
| cursor.execute(''' | |
| INSERT OR REPLACE INTO cases ( | |
| लिङ्क, निर्णय_नं, भाग, मुद्दाको_किसिम, साल, महिना, अंक, फैसला_मिति, | |
| अदालत_वा_इजलास, न्यायाधीश, आदेश_मिति, केस_नम्बर, विषय, निवेदक, विपक्षी, | |
| प्रकरण, ठहर, html_file_path | |
| ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) | |
| ''', ( | |
| data["लिङ्क"], data["निर्णय नं."], data["भाग"], data["मुद्दाको किसिम"], | |
| data["साल"], data["महिना"], data["अंक"], data["फैसला मिति"], | |
| data["अदालत / इजलास"], data["न्यायाधीश"], data["आदेश मिति"], data["केस_नम्बर"], | |
| data["विषय"], data["निवेदक"], data["विपक्षी"], data["प्रकरण"], data["ठहर"], | |
| data["html_file_path"] | |
| )) | |
| self.conn.commit() | |
| except sqlite3.Error as e: | |
| print(f"Database error: {e}") | |
| raise | |
| def save_failed_links(self, failed_links, mudda_type, sal, error_msg="Unknown error"): | |
| """Save failed links to SQLite database""" | |
| if failed_links: | |
| cursor = self.conn.cursor() | |
| for link in failed_links: | |
| try: | |
| cursor.execute(''' | |
| INSERT INTO failed_links (मुद्दाको_किसिम, साल, लिङ्क, error_message, retry_count) | |
| VALUES (?, ?, ?, ?, ?) | |
| ''', (mudda_type, sal, link, error_msg, 1)) | |
| except sqlite3.Error as e: | |
| print(f"Error saving failed link {link}: {e}") | |
| try: | |
| self.conn.commit() | |
| except sqlite3.Error as e: | |
| print(f"Error committing failed links: {e}") | |
| def test_single_link(self, url, mudda_type=None, sal=None, use_saved=True): | |
| """Test scraping a single link""" | |
| print(f"Testing single link: {url}") | |
| # If mudda_type and sal were not provided, try to recover them from previously scraped rows | |
| if not mudda_type or not sal: | |
| cursor = self.conn.cursor() | |
| cursor.execute('SELECT मुद्दाको_किसिम, साल FROM cases WHERE लिङ्क = ?', (url,)) | |
| result = cursor.fetchone() | |
| if result: | |
| mudda_type, sal = result | |
| print(f"Found existing data: mudda_type={mudda_type}, sal={sal}") | |
| if not mudda_type or not sal: | |
| print("Warning: mudda_type and sal not provided and couldn't be determined from existing data") | |
| print("Using generic scraping without HTML file management") | |
| success = self.scrape_case_details_generic(url, mudda_type, sal, use_saved) | |
| if success: | |
| print("✓ Successfully scraped and saved to database") | |
| else: | |
| print("✗ Failed to scrape") | |
| return success | |
| def test_saved_html_files(self, mudda_type=None, sal=None, limit=None): | |
| """Test scraping from saved HTML files""" | |
| html_files = self.get_saved_html_files_by_criteria(mudda_type, sal) | |
| if not html_files: | |
| print("No saved HTML files found matching criteria") | |
| return | |
| print(f"Found {len(html_files)} saved HTML files") | |
| if limit: | |
| html_files = html_files[:limit] | |
| print(f"Testing first {limit} files") | |
| successful_count = 0 | |
| failed_count = 0 | |
| for html_file in html_files: | |
| file_mudda_type, file_sal, link_number = self.extract_info_from_filename(html_file) | |
| if not file_mudda_type or not file_sal: | |
| print(f"Could not extract info from filename: {html_file}") | |
| failed_count += 1 | |
| continue | |
| # Reconstruct the URL from the filename's link number (assumes the standard /full_detail/<id> pattern) | |
| url = f"https://nkp.gov.np/full_detail/{link_number}" | |
| print(f"Testing {html_file} -> {file_mudda_type}, {file_sal}") | |
| success = self.scrape_case_details_generic(url, file_mudda_type, file_sal, use_saved=True) | |
| if success: | |
| successful_count += 1 | |
| else: | |
| failed_count += 1 | |
| print(f"\nTest Results:") | |
| print(f"✓ Successful: {successful_count}") | |
| print(f"✗ Failed: {failed_count}") | |
| print(f"Total: {len(html_files)}") | |
| def run_scraper(self, mudda_type, sal, use_saved=True): | |
| """Main method to run the scraper""" | |
| print(f"Starting scraper for mudda_type: {mudda_type}, sal: {sal}") | |
| print(f"Using database: {self.output_db}") | |
| print(f"HTML folder: {self.html_folder}") | |
| print(f"Use saved HTML files: {use_saved}") | |
| # Validate inputs | |
| if mudda_type not in self.mudda_type_arr: | |
| raise ValueError(f"Invalid mudda_type. Must be one of: {self.mudda_type_arr}") | |
| # Generate search URL | |
| try: | |
| search_url = self.search_url(mudda_type, sal) | |
| print(f"Search URL: {search_url}") | |
| except Exception as e: | |
| print(f"Error generating search URL: {e}") | |
| return | |
| # Get all case URLs | |
| print("Fetching all case URLs...") | |
| case_urls = self.get_all_pages(search_url, mudda_type, sal, use_saved) | |
| if not case_urls: | |
| print("No case URLs found!") | |
| return | |
| print(f"Found {len(case_urls)} case URLs to scrape") | |
| # Scrape each case | |
| successful_count = 0 | |
| failed_links = [] | |
| for i, url in enumerate(case_urls, 1): | |
| print(f"Processing {i}/{len(case_urls)}: {url}") | |
| success = self.scrape_case_details_generic(url, mudda_type, sal, use_saved) | |
| if success: | |
| successful_count += 1 | |
| else: | |
| failed_links.append(url) | |
| # Add delay between requests only if downloading from web | |
| if not use_saved: | |
| time.sleep(2) | |
| # Retry failed links once | |
| if failed_links: | |
| print(f"\nRetrying {len(failed_links)} failed links...") | |
| still_failed = [] | |
| for i, url in enumerate(failed_links, 1): | |
| print(f"Retrying {i}/{len(failed_links)}: {url}") | |
| success = self.scrape_case_details_generic(url, mudda_type, sal, use_saved=False) # Force web download on retry | |
| if success: | |
| successful_count += 1 | |
| else: | |
| still_failed.append(url) | |
| time.sleep(2) | |
| # Save permanently failed links | |
| if still_failed: | |
| self.save_failed_links(still_failed, mudda_type, sal, "Failed after retry") | |
| print(f"\nFinal Results:") | |
| print(f"Total links found: {len(case_urls)}") | |
| print(f"Successfully scraped: {successful_count}") | |
| print(f"Failed to scrape: {len(still_failed)}") | |
| if still_failed: | |
| print(f"Failed links saved to database: failed_links table") | |
| else: | |
| print(f"\nResults:") | |
| print(f"Total links found: {len(case_urls)}") | |
| print(f"Successfully scraped: {successful_count}") | |
| print(f"Scraped data saved to SQLite database: {self.output_db}") | |
| def close(self): | |
| """Explicitly close the database connection""" | |
| if hasattr(self, 'conn'): | |
| self.conn.close() | |
| def __del__(self): | |
| """Close SQLite connection when the object is destroyed""" | |
| self.close() | |
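| # Example programmatic use (a sketch with illustrative values; the class is normally driven via the CLI below): | |
| # scraper = LegalCaseScraper(output_db="cases.db", html_folder="scraped_html") | |
| # scraper.run_scraper(mudda_type="रिट", sal="२०७०", use_saved=True) | |
| # scraper.close() | |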
| def create_parser(): | |
| """Create command line argument parser""" | |
| parser = argparse.ArgumentParser( | |
| description="Legal Case Scraper for Nepal Kanoon Patrika", | |
| formatter_class=argparse.RawDescriptionHelpFormatter, | |
| epilog=""" | |
| Examples: | |
| # Scrape specific mudda_type and year | |
| python app.py --mudda_type "दुनियाबादी देवानी" --nepali_year "२०७३" --database_name "app_test_db.db" | |
| # Test a specific link | |
| python app.py --test_link "https://nkp.gov.np/full_detail/8035" --mudda_type "दुनियाबादी देवानी" --nepali_year "२०७३" | |
| # Test saved HTML files | |
| python app.py --test_saved --nepali_year "२०७३" --limit 5 | |
| # Use saved HTML files for scraping (faster) | |
| python app.py --mudda_type "दुनियाबादी देवानी" --nepali_year "२०७३" --use_saved | |
| # List available mudda types | |
| python app.py --list_mudda_types | |
| """ | |
| ) | |
| parser.add_argument('--mudda_type', type=str, | |
| help='Mudda type (e.g., "दुनियाबादी देवानी")') | |
| parser.add_argument('--nepali_year', type=str, | |
| help='Nepali year (e.g., "२०७३")') | |
| parser.add_argument('--database_name', type=str, default='legal_cases_2.db', | |
| help='SQLite database filename (default: legal_cases_2.db)') | |
| parser.add_argument('--html_folder', type=str, default='scraped_html', | |
| help='Folder to store HTML files (default: scraped_html)') | |
| parser.add_argument('--use_saved', action='store_true', | |
| help='Use saved HTML files when available (faster)') | |
| parser.add_argument('--test_link', type=str, | |
| help='Test scraping a specific link') | |
| parser.add_argument('--test_saved', action='store_true', | |
| help='Test scraping from saved HTML files') | |
| parser.add_argument('--limit', type=int, | |
| help='Limit number of files to test (use with --test_saved)') | |
| parser.add_argument('--list_mudda_types', action='store_true', | |
| help='List all available mudda types') | |
| return parser | |
| def main(): | |
| """Main function to run the application""" | |
| parser = create_parser() | |
| args = parser.parse_args() | |
| # List mudda types if requested | |
| if args.list_mudda_types: | |
| temp_scraper = LegalCaseScraper() | |
| print("Available mudda_type options:") | |
| for i, option in enumerate(temp_scraper.mudda_type_arr, 1): | |
| print(f"{i}. {option}") | |
| temp_scraper.close() | |
| return | |
| # Create the scraper | |
| scraper = LegalCaseScraper( | |
| output_db=args.database_name, | |
| html_folder=args.html_folder | |
| ) | |
| try: | |
| # Test single link | |
| if args.test_link: | |
| scraper.test_single_link( | |
| args.test_link, | |
| args.mudda_type, | |
| args.nepali_year, | |
| use_saved=args.use_saved | |
| ) | |
| return | |
| # Test saved HTML files | |
| if args.test_saved: | |
| scraper.test_saved_html_files( | |
| mudda_type=args.mudda_type, | |
| sal=args.nepali_year, | |
| limit=args.limit | |
| ) | |
| return | |
| # Regular scraping | |
| if not args.mudda_type or not args.nepali_year: | |
| print("Error: --mudda_type and --nepali_year are required for scraping") | |
| print("Use --help for usage examples") | |
| return | |
| scraper.run_scraper( | |
| mudda_type=args.mudda_type, | |
| sal=args.nepali_year, | |
| use_saved=args.use_saved | |
| ) | |
| except Exception as e: | |
| print(f"Error: {e}") | |
| finally: | |
| scraper.close() | |
| if __name__ == "__main__": | |
| main() | |