diff --git "a/app.py" "b/app.py" --- "a/app.py" +++ "b/app.py" @@ -1,3151 +1,278 @@ -# -*- coding: utf-8 -*- -"""Copy of FindSpecsTrial(Retrieving+boundingBoxes)-InitialMarkups(ALL)_CleanedUp.ipynb - -Automatically generated by Colab. - -Original file is located at - https://colab.research.google.com/drive/12XfVkmKmN3oVjHhLVE0_GgkftgArFEK2 -""" -baselink='https://findconsole-initialmarkups.hf.space/view-pdf?' - -newlink='https://findconsole-initialmarkups.hf.space/view-highlight?' -tobebilledonlyLink='https://findconsole-initialmarkups.hf.space/view-pdf-tobebilled?' - - -from urllib.parse import urlparse, unquote -import os -from io import BytesIO -import re -import requests -import pandas as pd -import fitz # PyMuPDF -import re -import urllib.parse -import pandas as pd -import math -import random +from flask import Flask, request, jsonify, abort , render_template , send_file +import tsadropboxretrieval import json -from datetime import datetime -from collections import defaultdict, Counter -import difflib -from fuzzywuzzy import fuzz - -def filteredJsons(pdf_path,filteredjsonsfromrawan): - # for heading in subjects: - extract_section_under_headerRawan (pdf_path=pdf_path,listofheadingsfromrawan=filteredjsonsfromrawan) - - - - -def get_regular_font_size_and_color(doc): - font_sizes = [] - colors = [] - fonts = [] - - # Loop through all pages - for page_num in range(len(doc)): - page = doc.load_page(page_num) - for span in page.get_text("dict")["blocks"]: - if "lines" in span: - for line in span["lines"]: - for span in line["spans"]: - font_sizes.append(span['size']) - colors.append(span['color']) - fonts.append(span['font']) - - # Get the most common font size, color, and font - most_common_font_size = Counter(font_sizes).most_common(1)[0][0] if font_sizes else None - most_common_color = Counter(colors).most_common(1)[0][0] if colors else None - most_common_font = Counter(fonts).most_common(1)[0][0] if fonts else None - - return most_common_font_size, 
most_common_color, most_common_font - -def normalize_text(text): - if text is None: - return "" - return re.sub(r'\s+', ' ', text.strip().lower()) - -def get_spaced_text_from_spans(spans): - return normalize_text(" ".join(span["text"].strip() for span in spans)) - -def is_header(span, most_common_font_size, most_common_color, most_common_font): - fontname = span.get("font", "").lower() - # is_italic = "italic" in fontname or "oblique" in fontname - is_bold = "bold" in fontname or span.get("bold", False) - return ( - ( - span["size"] > most_common_font_size or - span["font"].lower() != most_common_font.lower() or - (is_bold and span["size"] > most_common_font_size ) - ) - ) - -def add_span_to_nearest_group(span_y, grouped_dict, pageNum=None, threshold=0.5): - for (p, y) in grouped_dict: - if pageNum is not None and p != pageNum: - continue - if abs(y - span_y) <= threshold: - return (p, y) - return (pageNum, span_y) - -def extract_headers(doc, toc_pages, most_common_font_size, most_common_color, most_common_font, top_margin, bottom_margin): - print("Font baseline:", most_common_font_size, most_common_color, most_common_font) - - grouped_headers = defaultdict(list) - spans = [] - line_merge_threshold = 1.5 # Maximum vertical distance between lines to consider as part of same header - - for pageNum in range(len(doc)): - if pageNum in toc_pages: - continue - page = doc.load_page(pageNum) - page_height = page.rect.height - text_instances = page.get_text("dict") - - # First pass: collect all potential header spans - potential_header_spans = [] - for block in text_instances['blocks']: - if block['type'] != 0: - continue - - for line in block['lines']: - for span in line['spans']: - span_y0 = span['bbox'][1] - span_y1 = span['bbox'][3] - - if span_y0 < top_margin or span_y1 > (page_height - bottom_margin): - continue - - span_text = normalize_text(span.get('text', '')) - if not span_text: - continue - if span_text.startswith('http://www') or span_text.startswith('www'): - 
continue - if any(( - 'page' in span_text, - not re.search(r'[a-z0-9]', span_text), - 'end of section' in span_text, - re.search(r'page\s+\d+\s+of\s+\d+', span_text), - re.search(r'\b(?:\d{1,2}[/-])?\d{1,2}[/-]\d{2,4}\b', span_text), - # re.search(r'\b(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)', span_text), - 'specification:' in span_text - )): - continue - - cleaned_text = re.sub(r'[.\-]{4,}.*$', '', span_text).strip() - cleaned_text = normalize_text(cleaned_text) - - if is_header(span, most_common_font_size, most_common_color, most_common_font): - potential_header_spans.append({ - 'text': cleaned_text, - 'size': span['size'], - 'pageNum': pageNum, - 'y0': span_y0, - 'y1': span_y1, - 'x0': span['bbox'][0], - 'x1': span['bbox'][2], - 'span': span - }) - - # Sort spans by vertical position (top to bottom) - potential_header_spans.sort(key=lambda s: (s['pageNum'], s['y0'])) - - # Second pass: group spans that are vertically close and likely part of same header - i = 0 - while i < len(potential_header_spans): - current = potential_header_spans[i] - header_text = current['text'] - header_size = current['size'] - header_page = current['pageNum'] - min_y = current['y0'] - max_y = current['y1'] - spans_group = [current['span']] - - # Look ahead to find adjacent lines that might be part of same header - j = i + 1 - while j < len(potential_header_spans): - next_span = potential_header_spans[j] - # Check if on same page and vertically close with similar styling - if (next_span['pageNum'] == header_page and - next_span['y0'] - max_y < line_merge_threshold and - abs(next_span['size'] - header_size) < 0.5): - header_text += " " + next_span['text'] - max_y = next_span['y1'] - spans_group.append(next_span['span']) - j += 1 - else: - break - - # Add the merged header - grouped_headers[(header_page, min_y)].append({ - "text": header_text.strip(), - "size": header_size, - "pageNum": header_page, - "spans": spans_group - }) - spans.extend(spans_group) - i = j # Skip the 
spans we've already processed - - # Prepare final headers list - headers = [] - for (pageNum, y), header_groups in sorted(grouped_headers.items()): - for group in header_groups: - headers.append([ - group['text'], - group['size'], - group['pageNum'], - y - ]) - - font_sizes = [size for _, size, _, _ in headers] - font_size_counts = Counter(font_sizes) - - # Filter font sizes that appear at least 3 times - valid_font_sizes = [size for size, count in font_size_counts.items() if count >= 3] - - # Sort in descending order - valid_font_sizes_sorted = sorted(valid_font_sizes, reverse=True) - - # If only 2 sizes, repeat the second one - if len(valid_font_sizes_sorted) == 2: - top_3_font_sizes = [valid_font_sizes_sorted[0], valid_font_sizes_sorted[1], valid_font_sizes_sorted[1]] +import Find_Hyperlinking_text +import findspecsv1 +import InitialMarkups +import requests +from io import BytesIO +import datetime +import time +from threading import Thread +import urllib +from urllib.parse import quote +app = Flask(__name__) + +pageNumTextFound = 0 +BASE_URL = "https://findconsole-initialmarkups.hf.space" +# Simulate a backend readiness flag (replace with actual check if possible) +backend_ready = False +# @app.route("/") +# def thismain(): +# print('Home page loaded') +# return render_template("gui.html") + +@app.route("/keepaliveapii", methods=["GET", "POST"]) +def keepaliveapi(): + try: + print('Keepalive pinged') + return 'alivee' + except Exception as error: + print('Error in keepalive:', error) + return jsonify(status="error", message=str(error)), 500 + + + +@app.route("/") +def home(): + global backend_ready + # If backend not ready, show loading page + if not backend_ready: + return render_template("wake_and_redirect.html") else: - top_3_font_sizes = valid_font_sizes_sorted[:3] - - # Get the smallest font size among valid ones - smallest_font_size = min(valid_font_sizes) if valid_font_sizes else None + # Redirect to your PDF viewer route when ready + return 
redirect(url_for("view_pdf", **request.args)) +################################################################################################################################################################ +################################################################################################################################################################ +##################### Main console ########################################################################################################### +################################################################################################################################################################ +################################################################################################################################################################ + +@app.route('/view-pdf', methods=['GET']) +def download_pdf(): + # Parse and decode pdfLink safely + full_query_string = request.query_string.decode() + parsed_params = urllib.parse.parse_qs(full_query_string) + encoded_pdf_link = parsed_params.get('pdfLink', [None])[0] - return headers, top_3_font_sizes, smallest_font_size, spans - -def is_numbered(text): - return bool(re.match(r'^\d', text.strip())) - -def is_similar(a, b, threshold=0.85): - return difflib.SequenceMatcher(None, a, b).ratio() > threshold - -def normalize(text): - text = text.lower() - text = re.sub(r'\.{2,}', '', text) # remove long dots - text = re.sub(r'\s+', ' ', text) # replace multiple spaces with one - return text.strip() - -def clean_toc_entry(toc_text): - """Remove page numbers and formatting from TOC entries""" - # Remove everything after last sequence of dots/whitespace followed by digits - return re.sub(r'[\.\s]+\d+.*$', '', toc_text).strip('. 
') - -def build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font, top_margin=70, bottom_margin=70): - # Extract headers with margin handling - headers_list, top_3_font_sizes, smallest_font_size, spans = extract_headers( - doc, - toc_pages=toc_pages, - most_common_font_size=most_common_font_size, - most_common_color=most_common_color, - most_common_font=most_common_font, - top_margin=top_margin, - bottom_margin=bottom_margin + if not encoded_pdf_link: + return "Missing pdfLink parameter.", 400 + + # Decode the URL-encoded PDF link + pdf_link = urllib.parse.unquote(encoded_pdf_link) + print("Extracted PDF Link:", pdf_link) + + try: + # Use InitialMarkups to extract content + pdf_content = InitialMarkups.extract_section_under_header(pdf_link)[0] + except Exception as e: + print("Error during PDF extraction:", e) + return "PDF could not be processed.", 500 + + if pdf_content is None or not pdf_content.startswith(b"%PDF"): + return "PDF content not found or broken.", 404 + + pdf_bytes = BytesIO(pdf_content) + return send_file( + pdf_bytes, + mimetype='application/pdf', + as_attachment=False, + download_name=f"annotated_page_{pageNumTextFound}.pdf" ) - # Step 1: Collect and filter potential headers - headers = [] - seen_headers = set() - - # First extract TOC entries to get exact level 0 header texts - toc_entries = {} - for pno in toc_pages: - page = doc.load_page(pno) - toc_text = page.get_text() - for line in toc_text.split('\n'): - clean_line = line.strip() - if clean_line: - norm_line = normalize(clean_line) - toc_entries[norm_line] = clean_line # Store original text - - for h in headers_list: - text, size, pageNum, y = h[:4] - page = doc.load_page(pageNum) - page_height = page.rect.height - - # Skip margin areas - if y < top_margin or y > (page_height - bottom_margin): - continue - - norm_text = normalize(text) - if len(norm_text) > 2 and size >= most_common_font_size: - headers.append({ - "text": text, - "page": pageNum, 
- "y": y, - "size": size, - "bold": h[4] if len(h) > 4 else False, - # "italic": h[5] if len(h) > 5 else False, - "color": h[6] if len(h) > 6 else None, - "font": h[7] if len(h) > 7 else None, - "children": [], - "is_numbered": is_numbered(text), - "original_size": size, - "norm_text": norm_text, - "level": -1 # Initialize as unassigned - }) - - # Sort by page and vertical position - headers.sort(key=lambda h: (h['page'], h['y'])) - # Step 2: Detect consecutive headers and assign levels - i = 0 - while i < len(headers) - 1: - current = headers[i] - next_header = headers[i+1] - - # Check if they are on the same page and very close vertically (likely consecutive lines) - if (current['page'] == next_header['page'] and - abs(current['y'] - next_header['y']) < 20): # 20pt threshold for "same line" - - # Case 1: Both unassigned - make current level 1 and next level 2 - if current['level'] == -1 and next_header['level'] == -1: - current['level'] = 1 - next_header['level'] = 2 - i += 1 # Skip next header since we processed it - - # Case 2: Current unassigned, next assigned - make current one level above - elif current['level'] == -1 and next_header['level'] != -1: - current['level'] = max(1, next_header['level'] - 1) - - # Case 3: Current assigned, next unassigned - make next one level below - elif current['level'] != -1 and next_header['level'] == -1: - next_header['level'] = current['level'] + 1 - i += 1 # Skip next header since we processed it - i += 1 - # Step 2: Identify level 0 headers (largest and in TOC) - # max_size = max(h['size'] for h in headers) if headers else 0 - max_size,subheaderSize,nbsheadersize=top_3_font_sizes - print(max_size) - toc_text_match=[] - # Improved TOC matching with exact and substring matching - toc_matches = [] - for h in headers: - norm_text = h['norm_text'] - matching_toc_texts = [] - - # Check both exact matches and substring matches - for toc_norm, toc_text in toc_entries.items(): - # Exact match case - if norm_text == toc_norm and 
len(toc_text)>4 and h['size']==max_size: - matching_toc_texts.append(toc_text) - # Substring match case (header is substring of TOC entry) - elif norm_text in toc_norm and len(toc_text)>4 and h['size']==max_size: - matching_toc_texts.append(toc_text) - # Substring match case (TOC entry is substring of header) - elif toc_norm in norm_text and len(toc_text)>4 and h['size']==max_size: - matching_toc_texts.append(toc_text) - - if matching_toc_texts and h['size'] >= max_size * 0.9: - best_match = max(matching_toc_texts, - key=lambda x: (len(x), -len(x.replace(norm_text, '')))) - h['text'] = normalize_text(clean_toc_entry(best_match)) - h['level'] = 0 - if h['text'] not in toc_text_match: - toc_matches.append(h) - toc_text_match.append(h['text']) - elif matching_toc_texts and h['size'] < max_size * 0.9 and h['size'] > nbsheadersize : # h['size'] < max_size * 0.9 and h['size'] > max_size*0.75: - print(h['text'],matching_toc_texts) - headers.remove(h) - continue - - - # Remove duplicates - keep only first occurrence of each level 0 header - unique_level0 = [] - seen_level0 = set() - for h in toc_matches: - # Use the cleaned text for duplicate checking - cleaned_text = clean_toc_entry(h['text']) - norm_cleaned_text = normalize(cleaned_text) - - if norm_cleaned_text not in seen_level0: - seen_level0.add(norm_cleaned_text) - # Update the header text with cleaned version - h['text'] = cleaned_text - unique_level0.append(h) - print(f"Added unique header: {cleaned_text} (normalized: {norm_cleaned_text})") - - # Step 3: Process headers under each level 0 to identify level 1 format - - # First, group headers by their level 0 parent - level0_headers = [h for h in headers if h['level'] == 0] - header_groups = [] - - for i, level0 in enumerate(level0_headers): - start_idx = headers.index(level0) - end_idx = headers.index(level0_headers[i+1]) if i+1 < len(level0_headers) else len(headers) - group = headers[start_idx:end_idx] - header_groups.append(group) - - # Now process each group 
to identify level 1 format - for group in header_groups: - level0 = group[0] - level1_candidates = [h for h in group[1:] if h['level'] == -1] - - if not level1_candidates: - continue - - # The first candidate is our reference level 1 - first_level1 = level1_candidates[0] - level1_format = { - 'font': first_level1['font'], - 'color': first_level1['color'], - 'starts_with_number': is_numbered(first_level1['text']), - 'size': first_level1['size'], - 'bold': first_level1['bold'] - # 'italic': first_level1['italic'] - } - - # Assign levels based on the reference format - for h in level1_candidates: - current_format = { - 'font': h['font'], - 'color': h['color'], - 'starts_with_number': is_numbered(h['text']), - 'size': h['size'], - 'bold': h['bold'] - # 'italic': h['italic'] - } - - # Compare with level1 format - if (current_format['font'] == level1_format['font'] and - current_format['color'] == level1_format['color'] and - current_format['starts_with_number'] == level1_format['starts_with_number'] and - abs(current_format['size'] - level1_format['size']) <= 0.1 and - current_format['bold'] == level1_format['bold'] ): #and - # current_format['italic'] == level1_format['italic']): - h['level'] = 1 - else: - h['level'] = 2 - - # Step 4: Assign levels to remaining unassigned headers - unassigned = [h for h in headers if h['level'] == -1] - if unassigned: - # Cluster by size with tolerance - sizes = sorted({h['size'] for h in unassigned}, reverse=True) - clusters = [] - - for size in sizes: - found_cluster = False - for cluster in clusters: - if abs(size - cluster['size']) <= max(size, cluster['size']) * 0.1: - cluster['headers'].extend([h for h in unassigned if abs(h['size'] - size) <= size * 0.1]) - found_cluster = True - break - if not found_cluster: - clusters.append({ - 'size': size, - 'headers': [h for h in unassigned if abs(h['size'] - size) <= size * 0.1] - }) - - # Assign levels starting from 1 - clusters.sort(key=lambda x: -x['size']) - for i, cluster in 
enumerate(clusters): - for h in cluster['headers']: - base_level = i + 1 - if h['bold']: - base_level = max(1, base_level - 1) - h['level'] = base_level - - # Step 5: Build hierarchy - root = [] - stack = [] - - # Create a set of normalized texts from unique_level0 to avoid duplicates - unique_level0_texts = {h['norm_text'] for h in unique_level0} - - # Filter out any headers from the original list that match unique_level0 headers - filtered_headers = [] - for h in headers: - if h['norm_text'] in unique_level0_texts and h not in unique_level0: - h['level'] = 0 - filtered_headers.append(h) - - # Combine all headers - unique_level0 first, then the filtered headers - all_headers = unique_level0 + filtered_headers - all_headers.sort(key=lambda h: (h['page'], h['y'])) - - # Track which level 0 headers we've already added - added_level0 = set() - - for header in all_headers: - if header['level'] < 0: - continue - - if header['level'] == 0: - norm_text = header['norm_text'] - if norm_text in added_level0: - continue - added_level0.add(norm_text) - - # Pop stack until we find a parent - while stack and stack[-1]['level'] >= header['level']: - stack.pop() - - current_parent = stack[-1] if stack else None - - if current_parent: - current_parent['children'].append(header) - else: - root.append(header) - - stack.append(header) - - # Step 6: Enforce proper nesting - def enforce_nesting(node_list, parent_level=-1): - for node in node_list: - if node['level'] <= parent_level: - node['level'] = parent_level + 1 - enforce_nesting(node['children'], node['level']) - - enforce_nesting(root) - root = [h for h in root if not (h['level'] == 0 and not h['children'])] - return root - -def adjust_levels_if_level0_not_in_toc(doc, toc_pages, root): - def normalize(text): - return re.sub(r'\s+', ' ', text.strip().lower()) - - toc_text = "" - for pno in toc_pages: - page = doc.load_page(pno) - toc_text += page.get_text() - toc_text_normalized = normalize(toc_text) - - def 
is_level0_in_toc_text(header): - return header['level'] == 0 and normalize(header['text']) in toc_text_normalized - - if any(is_level0_in_toc_text(h) for h in root): - return # No change needed - - def increase_levels(node_list): - for node in node_list: - node['level'] += 1 - increase_levels(node['children']) - -def assign_numbers_to_headers(headers, prefix=None): - for idx, header in enumerate(headers, 1): - current_number = f"{prefix}.{idx}" if prefix else str(idx) - header["number"] = current_number - assign_numbers_to_headers(header["children"], current_number) - -def print_tree_with_numbers(headers, indent=0): - for header in headers: - size_info = f"size:{header['original_size']:.1f}" if 'original_size' in header else "" - print(" " * indent + - f"{header.get('number', '?')} {header['text']} " + - f"(Level {header['level']}, p:{header['page']+1}, {size_info})") - print_tree_with_numbers(header["children"], indent + 1) - - -def highlight_boxes(doc, highlights, stringtowrite, fixed_width=500): # Set your desired width here - for page_num, bbox in highlights.items(): - page = doc.load_page(page_num) - page_width = page.rect.width - - # Get original rect for vertical coordinates - orig_rect = fitz.Rect(bbox) - rect_height = orig_rect.height - if rect_height > 30: - if orig_rect.width > 10: - # Center horizontally using fixed width - center_x = page_width / 2 - new_x0 = center_x - fixed_width / 2 - new_x1 = center_x + fixed_width / 2 - new_rect = fitz.Rect(new_x0, orig_rect.y0, new_x1, orig_rect.y1) - - # Add highlight rectangle - annot = page.add_rect_annot(new_rect) - if stringtowrite.startswith('Not'): - annot.set_colors(stroke=(0.5, 0.5, 0.5), fill=(0.5, 0.5, 0.5)) - else: - annot.set_colors(stroke=(1, 1, 0), fill=(1, 1, 0)) - - annot.set_opacity(0.3) - annot.update() - # Add right-aligned freetext annotation inside the fixed-width box - text = '['+stringtowrite +']' - annot1 = page.add_freetext_annot( - new_rect, - text, - fontsize=15, - fontname='helv', - 
text_color=(1, 0, 0), - rotate=page.rotation, - align=2 # right alignment - ) - annot1.update() - -def get_leaf_headers_with_paths(listtoloop, path=None, output=None): - if path is None: - path = [] - if output is None: - output = [] - for header in listtoloop: - current_path = path + [header['text']] - if not header['children']: - if header['level'] != 0 and header['level'] != 1: - output.append((header, current_path)) - else: - get_leaf_headers_with_paths(header['children'], current_path, output) - return output - -# Add this helper function at the top of your code -def words_match_ratio(text1, text2): - words1 = set(text1.split()) - words2 = set(text2.split()) - if not words1 or not words2: - return 0.0 - common_words = words1 & words2 - return len(common_words) / len(words1) - -def same_start_word(s1, s2): - # Split both strings into words - words1 = s1.strip().split() - words2 = s2.strip().split() - - # Check if both have at least one word and compare the first ones - if words1 and words2: - return words1[0].lower() == words2[0].lower() - return False +@app.route('/api/process-data', methods=['POST']) +def receive_pdf_data(): + global pdf_content, pageNumTextFound + + # Get PDF link and keyword from finddata() + pdfLink = finddata() + + if not pdfLink : + return jsonify({"error": "'pdfLink' must be provided."}), 400 + + try: + print(pdfLink) + + + pdfbytes, pdf_document,tablepdfoutput= InitialMarkups.extract_section_under_header(pdfLink) + dbxTeam= tsadropboxretrieval.ADR_Access_DropboxTeam('user') + + # Get metadata using the shared link + metadata = dbxTeam.sharing_get_shared_link_metadata(pdfLink) + dbPath='/TSA JOBS/ADR Test/FIND/' + pdflink= tsadropboxretrieval.uploadanyFile(doc=pdf_document,path=dbPath,pdfname=metadata.name) #doc=doc,pdfname=path,pdfpath=pdfpath+'Measured Plan/ + print('LINKS0',pdflink) + + dbPath='/TSA JOBS/ADR Test/FIND/' + 
tablepdfLink=tsadropboxretrieval.uploadanyFile(doc=tablepdfoutput,path=dbPath,pdfname=metadata.name.rsplit(".pdf", 1)[0] +' Markup Summary'+'.pdf') + print(f"PDF successfully uploaded to Dropbox at") + print('LINKS1',tablepdfLink) + return jsonify({ + "message": "PDF processed successfully.", + "PDF_MarkedUp": pdflink, + 'Table_PDF_Markup_Summary': tablepdfLink + }) + + except Exception as e: + return jsonify({"error": str(e)}), 500 +################################################################################################################################################################ +################################################################################################################################################################ +##################### Not to billed not markuped up ########################################################################################################### +################################################################################################################################################################ +################################################################################################################################################################ +@app.route('/findapitobebilled1', methods=['GET','POST']) +def findapitobebilled1(): + try: + print('In process [Try]') + data = request.get_json() + # Extracting values + pdfLink = data.get('filePath') + pdfbytes, pdf_document,tablepdfoutput, alltext_tobebilled= InitialMarkups.extract_section_under_header_tobebilledOnly(pdfLink) + + return jsonify(alltext_tobebilled) + except Exception as e: + return jsonify({"error": str(e)}), 500 - -def extract_section_under_header(pdf_path): - top_margin = 70 - bottom_margin = 50 - headertoContinue1 = False - headertoContinue2=False + +@app.route('/view-pdf-tobebilled', methods=['GET']) +def download_pdf_tobebilled(): + # Parse and decode pdfLink safely + full_query_string = request.query_string.decode() 
+ parsed_params = urllib.parse.parse_qs(full_query_string) + encoded_pdf_link = parsed_params.get('pdfLink', [None])[0] - parsed_url = urlparse(pdf_path) - filename = os.path.basename(parsed_url.path) - filename = unquote(filename) # decode URL-encoded characters - - # Optimized URL handling - if pdf_path and ('http' in pdf_path or 'dropbox' in pdf_path): - pdf_path = pdf_path.replace('dl=0', 'dl=1') - - # Cache frequently used values - response = requests.get(pdf_path) - pdf_content = BytesIO(response.content) - if not pdf_content: - raise ValueError("No valid PDF content found.") - - doc = fitz.open(stream=pdf_content, filetype="pdf") - docHighlights = fitz.open(stream=pdf_content, filetype="pdf") - most_common_font_size, most_common_color, most_common_font = get_regular_font_size_and_color(doc) - - # Precompute regex patterns - dot_pattern = re.compile(r'\.{3,}') - url_pattern = re.compile(r'https?://\S+|www\.\S+') - - def get_toc_page_numbers(doc, max_pages_to_check=15): - toc_pages = [] - for page_num in range(min(len(doc), max_pages_to_check)): - page = doc.load_page(page_num) - blocks = page.get_text("dict")["blocks"] - - dot_line_count = 0 - for block in blocks: - for line in block.get("lines", []): - line_text = get_spaced_text_from_spans(line["spans"]).strip() - if dot_pattern.search(line_text): - dot_line_count += 1 - - if dot_line_count >= 3: - toc_pages.append(page_num) - - return list(range(0, toc_pages[-1] +1)) if toc_pages else toc_pages - - toc_pages = get_toc_page_numbers(doc) - - headers, top_3_font_sizes, smallest_font_size, headersSpans = extract_headers( - doc, toc_pages, most_common_font_size, most_common_color, most_common_font, top_margin, bottom_margin + if not encoded_pdf_link: + return "Missing pdfLink parameter.", 400 + + # Decode the URL-encoded PDF link + pdf_link = urllib.parse.unquote(encoded_pdf_link) + print("Extracted PDF Link:", pdf_link) + + try: + # Use InitialMarkups to extract content + pdf_content = 
InitialMarkups.extract_section_under_header_tobebilledOnly(pdf_link)[0] + except Exception as e: + print("Error during PDF extraction:", e) + return "PDF could not be processed.", 500 + + if pdf_content is None or not pdf_content.startswith(b"%PDF"): + return "PDF content not found or broken.", 404 + + pdf_bytes = BytesIO(pdf_content) + return send_file( + pdf_bytes, + mimetype='application/pdf', + as_attachment=False, + download_name=f"annotated_page_{pageNumTextFound}.pdf" ) - hierarchy = build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font) - listofHeaderstoMarkup = get_leaf_headers_with_paths(hierarchy) - print('listofHeaderstoMarkup',listofHeaderstoMarkup) - # Precompute all children headers once - allchildrenheaders = [normalize_text(item['text']) for item, p in listofHeaderstoMarkup] - allchildrenheaders_set = set(allchildrenheaders) # For faster lookups - - df = pd.DataFrame(columns=["NBSLink","Subject","Page","Author","Creation Date","Layer",'Code', 'head above 1', "head above 2"]) - dictionaryNBS={} - data_list_JSON = [] - - if len(top_3_font_sizes)==3: - mainHeaderFontSize, subHeaderFontSize, subsubheaderFontSize = top_3_font_sizes - elif len(top_3_font_sizes)==2: - mainHeaderFontSize= top_3_font_sizes[0] - subHeaderFontSize= top_3_font_sizes[1] - subsubheaderFontSize= top_3_font_sizes[1] - - print("📌 Has TOC:", bool(toc_pages), " | Pages to skip:", toc_pages) - - # Preload all pages to avoid repeated loading - # pages = [doc.load_page(page_num) for page_num in range(len(doc)) if page_num not in toc_pages] - - for heading_to_searchDict, paths in listofHeaderstoMarkup: - heading_to_search = heading_to_searchDict['text'] - heading_to_searchPageNum = heading_to_searchDict['page'] - - print('headertosearch', heading_to_search) - - # Initialize variables - headertoContinue1 = False - headertoContinue2 = False - matched_header_line = None - done = False - collecting = False - collected_lines = [] - page_highlights = 
{} - current_bbox = {} - last_y1s = {} - mainHeader = '' - subHeader = '' - matched_header_line_norm = heading_to_search - break_collecting = False - heading_norm = normalize_text(heading_to_search) - paths_norm = [normalize_text(p) for p in paths[0]] if paths and paths[0] else [] - - for page_num in range(heading_to_searchPageNum,len(doc)): - if page_num in toc_pages: - continue - if break_collecting: - break - page=doc[page_num] - page_height = page.rect.height - blocks = page.get_text("dict")["blocks"] - - for block in blocks: - if break_collecting: - break - - lines = block.get("lines", []) - i = 0 - while i < len(lines): - if break_collecting: - break - - spans = lines[i].get("spans", []) - if not spans: - i += 1 - continue - - y0 = spans[0]["bbox"][1] - y1 = spans[0]["bbox"][3] - if y0 < top_margin or y1 > (page_height - bottom_margin): - i += 1 - continue - - line_text = get_spaced_text_from_spans(spans).lower() - line_text_norm = normalize_text(line_text) - - # Combine with next line if available - if i + 1 < len(lines): - next_spans = lines[i + 1].get("spans", []) - next_line_text = get_spaced_text_from_spans(next_spans).lower() - combined_line_norm = normalize_text(line_text + " " + next_line_text) - else: - combined_line_norm = line_text_norm - - # Check if we should continue processing - if combined_line_norm and combined_line_norm in paths[0]: - print(combined_line_norm) - headertoContinue1 = combined_line_norm - if combined_line_norm and combined_line_norm in paths[-2]: - print(combined_line_norm) - headertoContinue2 = combined_line_norm - if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() : - stringtowrite='Not to be billed' - else: - stringtowrite='To be billed' - # Optimized header matching - existsfull = ( - ( combined_line_norm in allchildrenheaders_set or - combined_line_norm in allchildrenheaders ) and heading_to_search in combined_line_norm - ) - - # New word-based matching 
- current_line_words = set(combined_line_norm.split()) - heading_words = set(heading_norm.split()) - all_words_match = current_line_words.issubset(heading_words) and len(current_line_words) > 0 - - substring_match = ( - heading_norm in combined_line_norm or - combined_line_norm in heading_norm or - all_words_match # Include the new word-based matching - ) - # substring_match = ( - # heading_norm in combined_line_norm or - # combined_line_norm in heading_norm - # ) - - if (substring_match and existsfull and not collecting and - len(combined_line_norm) > 0 ):#and (headertoContinue1 or headertoContinue2) ): - - # Check header conditions more efficiently - header_spans = [ - span for span in spans - if (is_header(span, most_common_font_size, most_common_color, most_common_font) - # and span['size'] >= subsubheaderFontSize - and span['size'] < mainHeaderFontSize) - ] - if header_spans: - collecting = True - matched_header_font_size = max(span["size"] for span in header_spans) - print(f"📥 Start collecting after header: {combined_line_norm} (Font size: {matched_header_font_size})") - - collected_lines.append(line_text) - valid_spans = [span for span in spans if span.get("bbox")] - - if valid_spans: - x0s = [span["bbox"][0] for span in valid_spans] - x1s = [span["bbox"][2] for span in valid_spans] - y0s = [span["bbox"][1] for span in valid_spans] - y1s = [span["bbox"][3] for span in valid_spans] - - header_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)] - - if page_num in current_bbox: - cb = current_bbox[page_num] - current_bbox[page_num] = [ - min(cb[0], header_bbox[0]), - min(cb[1], header_bbox[1]), - max(cb[2], header_bbox[2]), - max(cb[3], header_bbox[3]) - ] - else: - current_bbox[page_num] = header_bbox - last_y1s[page_num] = header_bbox[3] - x0, y0, x1, y1 = header_bbox - - zoom = 200 - left = int(x0) - top = int(y0) - zoom_str = f"{zoom},{left},{top}" - pageNumberFound = page_num + 1 - - # Build the query parameters - params = { - 'pdfLink': pdf_path, # Your PDF 
link - 'keyword': heading_to_search, # Your keyword (could be a string or list) - } - - # URL encode each parameter - encoded_params = {key: urllib.parse.quote(value, safe='') for key, value in params.items()} - - # Construct the final encoded link - encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()]) - - # Correctly construct the final URL with page and zoom - final_url = f"{baselink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}" - - # Get current date and time - now = datetime.now() - - # Format the output - formatted_time = now.strftime("%d/%m/%Y %I:%M:%S %p") - # Optionally, add the URL to a DataFrame - - - data_entry = { - "NBSLink": final_url, - "Subject": heading_to_search, - "Page": str(pageNumberFound), - "Author": "ADR", - "Creation Date": formatted_time, - "Layer": "Initial", - "Code": stringtowrite, - "head above 1": paths[-2], - "head above 2": paths[0], - "MC Connnection": 'Go to ' + paths[0].strip().split()[0] +'/'+ heading_to_search.strip().split()[0] + ' in '+ filename - } - data_list_JSON.append(data_entry) - - # Convert list to JSON - json_output = json.dumps(data_list_JSON, indent=4) - - print("Final URL:", final_url) - i += 2 - continue - else: - if (substring_match and not collecting and - len(combined_line_norm) > 0): # and (headertoContinue1 or headertoContinue2) ): - - # Calculate word match percentage - word_match_percent = words_match_ratio(heading_norm, combined_line_norm) * 100 - - # Check if at least 70% of header words exist in this line - meets_word_threshold = word_match_percent >= 100 - - # Check header conditions (including word threshold) - header_spans = [ - span for span in spans - if (is_header(span, most_common_font_size, most_common_color, most_common_font) - # and span['size'] >= subsubheaderFontSize - and span['size'] < mainHeaderFontSize) - ] - - if header_spans and (meets_word_threshold or same_start_word(heading_to_search, combined_line_norm) ): - collecting = True - 
matched_header_font_size = max(span["size"] for span in header_spans) - print(f"📥 Start collecting after header: {combined_line_norm} " - f"(Font: {matched_header_font_size}, Word match: {word_match_percent:.0f}%)") - - collected_lines.append(line_text) - valid_spans = [span for span in spans if span.get("bbox")] - - if valid_spans: - x0s = [span["bbox"][0] for span in valid_spans] - x1s = [span["bbox"][2] for span in valid_spans] - y0s = [span["bbox"][1] for span in valid_spans] - y1s = [span["bbox"][3] for span in valid_spans] - - header_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)] - - if page_num in current_bbox: - cb = current_bbox[page_num] - current_bbox[page_num] = [ - min(cb[0], header_bbox[0]), - min(cb[1], header_bbox[1]), - max(cb[2], header_bbox[2]), - max(cb[3], header_bbox[3]) - ] - else: - current_bbox[page_num] = header_bbox - - last_y1s[page_num] = header_bbox[3] - x0, y0, x1, y1 = header_bbox - zoom = 200 - left = int(x0) - top = int(y0) - zoom_str = f"{zoom},{left},{top}" - pageNumberFound = page_num + 1 - - # Build the query parameters - params = { - 'pdfLink': pdf_path, # Your PDF link - 'keyword': heading_to_search, # Your keyword (could be a string or list) - } - - # URL encode each parameter - encoded_params = {key: urllib.parse.quote(value, safe='') for key, value in params.items()} - - # Construct the final encoded link - encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()]) - - # Correctly construct the final URL with page and zoom - final_url = f"{baselink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}" - - # Get current date and time - now = datetime.now() - - # Format the output - formatted_time = now.strftime("%d/%m/%Y %I:%M:%S %p") - # Optionally, add the URL to a DataFrame - - - data_entry = { - "NBSLink": final_url, - "Subject": heading_to_search, - "Page": str(pageNumberFound), - "Author": "ADR", - "Creation Date": formatted_time, - "Layer": "Initial", - "Code": stringtowrite, - "head 
above 1": paths[-2], - "head above 2": paths[0], - "MC Connnection": 'Go to ' + paths[0].strip().split()[0] +'/'+ heading_to_search.strip().split()[0] + ' in '+ filename - } - data_list_JSON.append(data_entry) - - # Convert list to JSON - json_output = json.dumps(data_list_JSON, indent=4) - - print("Final URL:", final_url) - i += 2 - continue - if collecting: - norm_line = normalize_text(line_text) - - # Optimized URL check - if url_pattern.match(norm_line): - line_is_header = False - else: - line_is_header = any(is_header(span, most_common_font_size, most_common_color, most_common_font) for span in spans) - - if line_is_header: - header_font_size = max(span["size"] for span in spans) - is_probably_real_header = ( - header_font_size >= matched_header_font_size and - is_header(spans[0], most_common_font_size, most_common_color, most_common_font) and - len(line_text.strip()) > 2 - ) - - if (norm_line != matched_header_line_norm and - norm_line != heading_norm and - is_probably_real_header): - if line_text not in heading_norm: - print(f"🛑 Stop at header with same or larger font: '{line_text}' ({header_font_size} ≥ {matched_header_font_size})") - collecting = False - done = True - headertoContinue1 = False - headertoContinue2=False - for page_num, bbox in current_bbox.items(): - bbox[3] = last_y1s.get(page_num, bbox[3]) - page_highlights[page_num] = bbox - highlight_boxes(docHighlights, page_highlights,stringtowrite) - - break_collecting = True - break - - if break_collecting: - break - - collected_lines.append(line_text) - valid_spans = [span for span in spans if span.get("bbox")] - if valid_spans: - x0s = [span["bbox"][0] for span in valid_spans] - x1s = [span["bbox"][2] for span in valid_spans] - y0s = [span["bbox"][1] for span in valid_spans] - y1s = [span["bbox"][3] for span in valid_spans] - - line_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)] - - if page_num in current_bbox: - cb = current_bbox[page_num] - current_bbox[page_num] = [ - min(cb[0], line_bbox[0]), 
- min(cb[1], line_bbox[1]), - max(cb[2], line_bbox[2]), - max(cb[3], line_bbox[3]) - ] - else: - current_bbox[page_num] = line_bbox - - last_y1s[page_num] = line_bbox[3] - i += 1 - - if not done: - for page_num, bbox in current_bbox.items(): - bbox[3] = last_y1s.get(page_num, bbox[3]) - page_highlights[page_num] = bbox - if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() : - stringtowrite='Not to be billed' - else: - stringtowrite='To be billed' - highlight_boxes(docHighlights, page_highlights,stringtowrite) - - # docHighlights.save("highlighted_output.pdf", garbage=4, deflate=True) - - pdf_bytes = BytesIO() - docHighlights.save(pdf_bytes) - print('JSONN',json_output) - return pdf_bytes.getvalue(), docHighlights , json_output - - - - -######################################################################################################################################################## -######################################################################################################################################################## - - -def extract_section_under_header_tobebilledOnly(pdf_path): - Alltext_Tobebilled='' - top_margin = 70 - bottom_margin = 50 - headertoContinue1 = False - headertoContinue2=False - - parsed_url = urlparse(pdf_path) - filename = os.path.basename(parsed_url.path) - filename = unquote(filename) # decode URL-encoded characters - - # Optimized URL handling - if pdf_path and ('http' in pdf_path or 'dropbox' in pdf_path): - pdf_path = pdf_path.replace('dl=0', 'dl=1') - - # Cache frequently used values - response = requests.get(pdf_path) - pdf_content = BytesIO(response.content) - if not pdf_content: - raise ValueError("No valid PDF content found.") - - doc = fitz.open(stream=pdf_content, filetype="pdf") - docHighlights = fitz.open(stream=pdf_content, filetype="pdf") - most_common_font_size, most_common_color, most_common_font = get_regular_font_size_and_color(doc) - - # 
################################################################################
# For final markups - view one highlight at a time - not used yet
################################################################################


@app.route('/view-highlight', methods=['GET','POST'])
def download_pdfHighlight():
    """Return the PDF with the section under a single heading highlighted.

    Query parameters:
        pdfLink: URL of the source PDF (required).
        keyword: heading text whose section is highlighted (required).

    The page number and parent heading for ``keyword`` are looked up in the
    module-level ``jsonoutput`` cache, which the ``/findapi`` and
    ``/findapiFilteredHeadings`` routes populate.  When the cache is empty or
    has no entry for ``keyword`` the search starts from the first page.

    Returns:
        The annotated PDF (inline, ``application/pdf``), a 400 response when a
        required parameter is missing, or a 404 response when the extractor
        produced no PDF bytes.
    """
    # Manually parse the query parameters from the raw query string.
    raw_query = request.query_string.decode()
    parsed_params = urllib.parse.parse_qs(raw_query)
    pdf_link = parsed_params.get('pdfLink', [None])[0]
    keyword = parsed_params.get('keyword', [None])[0]

    # Both values are needed by the extractor; a missing keyword used to fall
    # through and crash later instead of producing a client error.
    if not pdf_link or not keyword:
        return "Missing required parameters.", 400

    # NOTE(review): parse_qs has already percent-decoded the value once; this
    # second unquote is kept from the original behaviour -- confirm whether the
    # links handed to this route are really double-encoded.
    pdf_link = urllib.parse.unquote(pdf_link)

    print("Extracted PDF Link:", pdf_link)
    print("Extracted Keywords:", keyword)

    # The cache may not exist yet (no /findapi call so far) and may hold either
    # a list of row dicts or a JSON-encoded string of that list.
    cached_rows = globals().get('jsonoutput') or []
    if isinstance(cached_rows, str):
        try:
            cached_rows = json.loads(cached_rows)
        except ValueError:
            cached_rows = []

    matching_item = next(
        (
            item for item in cached_rows
            if isinstance(item, dict) and item.get("Subject") == keyword
        ),
        None,
    )

    # Fallbacks used when nothing is cached for this keyword.
    # NOTE(review): 0 mirrors the default of the `incomingheader` parameter on
    # the pre-refactor extract_section_under_headerRawan definition -- confirm
    # it is still the default in InitialMarkups.
    page_number = 0
    stringtowrite = 0
    if matching_item:
        page_number = int(matching_item.get("Page")) - 1
        stringtowrite = matching_item.get("head above 1")
        print(f"Page number for '{keyword}': {page_number}")
    else:
        print("No match found.")

    pdf_content = InitialMarkups.extract_section_under_headerRawan(
        pdf_link, keyword, page_number, stringtowrite
    )[0]
    if pdf_content is None:
        return "PDF content not found.", 404

    pdf_bytes = BytesIO(pdf_content)
    return send_file(
        pdf_bytes,
        mimetype='application/pdf',
        as_attachment=False,
        # 1-based page number, matching the "Page" values stored in the cache.
        download_name=f"annotated_page_{page_number + 1}.pdf"
    )
subsubheaderFontSize= top_3_font_sizes[1] - - # print("📌 Has TOC:", bool(toc_pages), " | Pages to skip:", toc_pages) - - # Preload all pages to avoid repeated loading - # pages = [doc.load_page(page_num) for page_num in range(len(doc)) if page_num not in toc_pages] - - for heading_to_searchDict, paths in listofHeaderstoMarkup: - heading_to_search = heading_to_searchDict['text'] - heading_to_searchPageNum = heading_to_searchDict['page'] - - # print('headertosearch', heading_to_search) - - # Initialize variables - headertoContinue1 = False - headertoContinue2 = False - matched_header_line = None - done = False - collecting = False - collected_lines = [] - page_highlights = {} - current_bbox = {} - last_y1s = {} - mainHeader = '' - subHeader = '' - matched_header_line_norm = heading_to_search - break_collecting = False - heading_norm = normalize_text(heading_to_search) - paths_norm = [normalize_text(p) for p in paths[0]] if paths and paths[0] else [] - - for page_num in range(heading_to_searchPageNum,len(doc)): - if page_num in toc_pages: - continue - if break_collecting: - break - page=doc[page_num] - page_height = page.rect.height - blocks = page.get_text("dict")["blocks"] - - for block in blocks: - if break_collecting: - break - - lines = block.get("lines", []) - i = 0 - while i < len(lines): - if break_collecting: - break - - spans = lines[i].get("spans", []) - if not spans: - i += 1 - continue - - y0 = spans[0]["bbox"][1] - y1 = spans[0]["bbox"][3] - if y0 < top_margin or y1 > (page_height - bottom_margin): - i += 1 - continue - - line_text = get_spaced_text_from_spans(spans).lower() - line_text_norm = normalize_text(line_text) - - # Combine with next line if available - if i + 1 < len(lines): - next_spans = lines[i + 1].get("spans", []) - next_line_text = get_spaced_text_from_spans(next_spans).lower() - combined_line_norm = normalize_text(line_text + " " + next_line_text) - else: - combined_line_norm = line_text_norm - - # Check if we should continue processing 
- if combined_line_norm and combined_line_norm in paths[0]: - print(combined_line_norm) - headertoContinue1 = combined_line_norm - if combined_line_norm and combined_line_norm in paths[-2]: - print(combined_line_norm) - headertoContinue2 = combined_line_norm - if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() : - stringtowrite='Not to be billed' - else: - stringtowrite='To be billed' - # Optimized header matching - existsfull = ( - ( combined_line_norm in allchildrenheaders_set or - combined_line_norm in allchildrenheaders ) and heading_to_search in combined_line_norm - ) - - # New word-based matching - current_line_words = set(combined_line_norm.split()) - heading_words = set(heading_norm.split()) - all_words_match = current_line_words.issubset(heading_words) and len(current_line_words) > 0 - - substring_match = ( - heading_norm in combined_line_norm or - combined_line_norm in heading_norm or - all_words_match # Include the new word-based matching - ) - # substring_match = ( - # heading_norm in combined_line_norm or - # combined_line_norm in heading_norm - # ) - - if (substring_match and existsfull and not collecting and - len(combined_line_norm) > 0 ):#and (headertoContinue1 or headertoContinue2) ): - - # Check header conditions more efficiently - header_spans = [ - span for span in spans - if (is_header(span, most_common_font_size, most_common_color, most_common_font) - # and span['size'] >= subsubheaderFontSize - and span['size'] < mainHeaderFontSize) - ] - if header_spans and stringtowrite.startswith('To'): - Alltext_Tobebilled+=combined_line_norm - collecting = True - matched_header_font_size = max(span["size"] for span in header_spans) - # print(f"📥 Start collecting after header: {combined_line_norm} (Font size: {matched_header_font_size})") - - collected_lines.append(line_text) - valid_spans = [span for span in spans if span.get("bbox")] - - if valid_spans: - x0s = [span["bbox"][0] for 
span in valid_spans] - x1s = [span["bbox"][2] for span in valid_spans] - y0s = [span["bbox"][1] for span in valid_spans] - y1s = [span["bbox"][3] for span in valid_spans] - - header_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)] - - if page_num in current_bbox: - cb = current_bbox[page_num] - current_bbox[page_num] = [ - min(cb[0], header_bbox[0]), - min(cb[1], header_bbox[1]), - max(cb[2], header_bbox[2]), - max(cb[3], header_bbox[3]) - ] - else: - current_bbox[page_num] = header_bbox - last_y1s[page_num] = header_bbox[3] - x0, y0, x1, y1 = header_bbox - - zoom = 200 - left = int(x0) - top = int(y0) - zoom_str = f"{zoom},{left},{top}" - pageNumberFound = page_num + 1 - - # Build the query parameters - params = { - 'pdfLink': pdf_path, # Your PDF link - 'keyword': heading_to_search, # Your keyword (could be a string or list) - } - - # URL encode each parameter - encoded_params = {key: urllib.parse.quote(value, safe='') for key, value in params.items()} - - # Construct the final encoded link - encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()]) - - # Correctly construct the final URL with page and zoom - final_url = f"{tobebilledonlyLink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}" - - # Get current date and time - now = datetime.now() - - # Format the output - formatted_time = now.strftime("%d/%m/%Y %I:%M:%S %p") - # Optionally, add the URL to a DataFrame - - - data_entry = { - "NBSLink": final_url, - "Subject": heading_to_search, - "Page": str(pageNumberFound), - "Author": "ADR", - "Creation Date": formatted_time, - "Layer": "Initial", - "Code": stringtowrite, - "head above 1": paths[-2], - "head above 2": paths[0], - "MC Connnection": 'Go to ' + paths[0].strip().split()[0] +'/'+ heading_to_search.strip().split()[0] + ' in '+ filename - } - data_list_JSON.append(data_entry) - - # Convert list to JSON - json_output = json.dumps(data_list_JSON, indent=4) - - # print("Final URL:", final_url) - i += 2 - continue - else: 
- if (substring_match and not collecting and - len(combined_line_norm) > 0): # and (headertoContinue1 or headertoContinue2) ): - - # Calculate word match percentage - word_match_percent = words_match_ratio(heading_norm, combined_line_norm) * 100 - - # Check if at least 70% of header words exist in this line - meets_word_threshold = word_match_percent >= 100 - - # Check header conditions (including word threshold) - header_spans = [ - span for span in spans - if (is_header(span, most_common_font_size, most_common_color, most_common_font) - # and span['size'] >= subsubheaderFontSize - and span['size'] < mainHeaderFontSize) - ] - - if header_spans and (meets_word_threshold or same_start_word(heading_to_search, combined_line_norm) ) and stringtowrite.startswith('To'): - Alltext_Tobebilled+=combined_line_norm - collecting = True - matched_header_font_size = max(span["size"] for span in header_spans) - # print(f"📥 Start collecting after header: {combined_line_norm} " - # f"(Font: {matched_header_font_size}, Word match: {word_match_percent:.0f}%)") - - collected_lines.append(line_text) - valid_spans = [span for span in spans if span.get("bbox")] - - if valid_spans: - x0s = [span["bbox"][0] for span in valid_spans] - x1s = [span["bbox"][2] for span in valid_spans] - y0s = [span["bbox"][1] for span in valid_spans] - y1s = [span["bbox"][3] for span in valid_spans] - - header_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)] - - if page_num in current_bbox: - cb = current_bbox[page_num] - current_bbox[page_num] = [ - min(cb[0], header_bbox[0]), - min(cb[1], header_bbox[1]), - max(cb[2], header_bbox[2]), - max(cb[3], header_bbox[3]) - ] - else: - current_bbox[page_num] = header_bbox - - last_y1s[page_num] = header_bbox[3] - x0, y0, x1, y1 = header_bbox - zoom = 200 - left = int(x0) - top = int(y0) - zoom_str = f"{zoom},{left},{top}" - pageNumberFound = page_num + 1 - - # Build the query parameters - params = { - 'pdfLink': pdf_path, # Your PDF link - 'keyword': 
heading_to_search, # Your keyword (could be a string or list) - } - - # URL encode each parameter - encoded_params = {key: urllib.parse.quote(value, safe='') for key, value in params.items()} - - # Construct the final encoded link - encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()]) - - # Correctly construct the final URL with page and zoom - final_url = f"{tobebilledonlyLink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}" - - # Get current date and time - now = datetime.now() - - # Format the output - formatted_time = now.strftime("%d/%m/%Y %I:%M:%S %p") - # Optionally, add the URL to a DataFrame - - - data_entry = { - "NBSLink": final_url, - "Subject": heading_to_search, - "Page": str(pageNumberFound), - "Author": "ADR", - "Creation Date": formatted_time, - "Layer": "Initial", - "Code": stringtowrite, - "head above 1": paths[-2], - "head above 2": paths[0], - "MC Connnection": 'Go to ' + paths[0].strip().split()[0] +'/'+ heading_to_search.strip().split()[0] + ' in '+ filename - } - data_list_JSON.append(data_entry) - - # Convert list to JSON - json_output = json.dumps(data_list_JSON, indent=4) - - # print("Final URL:", final_url) - i += 2 - continue - if collecting: - norm_line = normalize_text(line_text) - - # Optimized URL check - if url_pattern.match(norm_line): - line_is_header = False - else: - line_is_header = any(is_header(span, most_common_font_size, most_common_color, most_common_font) for span in spans) - - if line_is_header: - header_font_size = max(span["size"] for span in spans) - is_probably_real_header = ( - header_font_size >= matched_header_font_size and - is_header(spans[0], most_common_font_size, most_common_color, most_common_font) and - len(line_text.strip()) > 2 - ) - - if (norm_line != matched_header_line_norm and - norm_line != heading_norm and - is_probably_real_header): - if line_text not in heading_norm: - # print(f"🛑 Stop at header with same or larger font: '{line_text}' ({header_font_size} 
@app.route('/findapiFilteredHeadings', methods=['GET','POST'])
def findapiFilteredHeadings():
    """Mark up only the headings supplied by the caller.

    Expects a JSON object body with:
        filePath: URL of the PDF to process.
        listofheadings: the headings to mark up (JSON), passed straight to
            ``InitialMarkups.extract_section_under_headerRawan``.

    The extractor's third return value is cached in the module-level
    ``jsonoutput`` and returned as the JSON response.

    Returns:
        The extractor's table output as JSON; 400 when the body or a required
        field is missing; 500 with the error text when extraction fails.
    """
    global jsonoutput
    print('In process [Try]')

    # silent=True yields None for an absent/invalid JSON body instead of
    # raising, so bad requests are reported as 400 rather than a generic 500.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object."}), 400

    pdfLink = data.get('filePath')
    listofheadings = data.get('listofheadings')  # in json format
    print(pdfLink)
    print(listofheadings)
    if not pdfLink or listofheadings is None:
        return jsonify({"error": "Both 'filePath' and 'listofheadings' are required."}), 400

    try:
        pdfbytes, pdf_document, tablepdfoutput = InitialMarkups.extract_section_under_headerRawan(
            pdfLink, listofheadings
        )
        jsonoutput = tablepdfoutput
        return jsonify(tablepdfoutput)
    except Exception as e:
        # Top-level route boundary: report the failure to the client.
        return jsonify({"error": str(e)}), 500


################################################################################
# For Rawan - MC Connection
################################################################################

@app.route('/findapi', methods=['GET','POST'])
def findapi():
    """Mark up every leaf heading of the PDF named in the request.

    Expects a JSON object body with:
        filePath: URL of the PDF to process.

    The third return value of ``InitialMarkups.extract_section_under_header``
    is cached in the module-level ``jsonoutput`` and returned as the JSON
    response.

    Returns:
        The extractor's table output as JSON; 400 when the body or
        ``filePath`` is missing; 500 with the error text when extraction fails.
    """
    global jsonoutput
    print('In process [Try]')

    # silent=True yields None for an absent/invalid JSON body instead of
    # raising, so bad requests are reported as 400 rather than a generic 500.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object."}), 400

    pdfLink = data.get('filePath')
    if not pdfLink:
        return jsonify({"error": "'filePath' is required."}), 400

    try:
        pdfbytes, pdf_document, tablepdfoutput = InitialMarkups.extract_section_under_header(pdfLink)
        jsonoutput = tablepdfoutput
        return jsonify(tablepdfoutput)
    except Exception as e:
        # Top-level route boundary: report the failure to the client.
        return jsonify({"error": str(e)}), 500


############################################# Testing #################################################

def finddata():
    """Return a fixed sample (PDF link, list of heading keywords) pair for manual testing."""
    pdfLink = 'https://www.dropbox.com/scl/fi/hnp4mqigb51a5kp89kgfa/00801-ARC-20-ZZ-S-A-0002.pdf?rlkey=45abeoebzqw4qwnslnei6dkd6&st=m4yrcjm2&dl=1'
    keyword = ['115 INTEGRATED MRI ROOM LININGS', '310 ACCURACY']
    return pdfLink, keyword
-######################################################################################################################################################## -######################################################################################################################################################## - -def extract_section_under_headerRawan(pdf_path,headingjson,pagenum=0,incomingheader=0): - top_margin = 70 - bottom_margin = 50 - # Optimized URL handling - if pdf_path and ('http' in pdf_path or 'dropbox' in pdf_path): - pdf_path = pdf_path.replace('dl=0', 'dl=1') - - # Cache frequently used values - response = requests.get(pdf_path) - pdf_content = BytesIO(response.content) - if not pdf_content: - raise ValueError("No valid PDF content found.") - - doc = fitz.open(stream=pdf_content, filetype="pdf") - docHighlights = fitz.open(stream=pdf_content, filetype="pdf") - most_common_font_size, most_common_color, most_common_font = get_regular_font_size_and_color(doc) - - # Precompute regex patterns - dot_pattern = re.compile(r'\.{3,}') - url_pattern = re.compile(r'https?://\S+|www\.\S+') - - def get_toc_page_numbers(doc, max_pages_to_check=15): - toc_pages = [] - for page_num in range(min(len(doc), max_pages_to_check)): - page = doc.load_page(page_num) - blocks = page.get_text("dict")["blocks"] - - dot_line_count = 0 - for block in blocks: - for line in block.get("lines", []): - line_text = get_spaced_text_from_spans(line["spans"]).strip() - if dot_pattern.search(line_text): - dot_line_count += 1 - - if dot_line_count >= 3: - toc_pages.append(page_num) - - return list(range(0, toc_pages[-1] +1)) if toc_pages else toc_pages - - toc_pages = get_toc_page_numbers(doc) - - headers, top_3_font_sizes, smallest_font_size, headersSpans = extract_headers( - doc, toc_pages, most_common_font_size, most_common_color, most_common_font, top_margin, bottom_margin - ) - - listofheadingsfromrawan=[] - if type(headingjson) == str: - listofheadingsfromrawan.append(headingjson) - 
headingjson=[headingjson] - else: - for item in headingjson: - listofheadingsfromrawan.append(normalize_text(item['Subject'])) - print('hereeeeeeeeeeeeeee0',listofheadingsfromrawan) - # Precompute all children headers once - allchildrenheaders = listofheadingsfromrawan - print('hereeeeeeeeeeeeeee00',allchildrenheaders) - allchildrenheaders_set = set(allchildrenheaders) # For faster lookups - - df = pd.DataFrame(columns=["NBSLink","Subject","Page","Author","Creation Date","Layer",'Code', 'head above 1', "head above 2"]) - data_list_JSON = [] - - if len(top_3_font_sizes)==3: - mainHeaderFontSize, subHeaderFontSize, subsubheaderFontSize = top_3_font_sizes - elif len(top_3_font_sizes)==2: - mainHeaderFontSize= top_3_font_sizes[0] - subHeaderFontSize= top_3_font_sizes[1] - subsubheaderFontSize= top_3_font_sizes[1] - - print("📌 Has TOC:", bool(toc_pages), " | Pages to skip:", toc_pages) - - # Preload all pages to avoid repeated loading - # pages = [doc.load_page(page_num) for page_num in range(len(doc)) if page_num not in toc_pages] - newjsonList=[] - for heading_to_searchDict in headingjson: - if type(heading_to_searchDict) == str: - heading_to_search = heading_to_searchDict - heading_to_searchPageNum = pagenum - else: - heading_to_search = heading_to_searchDict['Subject'] - heading_to_searchPageNum = int(heading_to_searchDict['Page'])-1 - incomingheader = heading_to_searchDict['head above 1'] - - print('hereeeeeeeeeeeeeee0',heading_to_searchPageNum) - done = False - collecting = False - collected_lines = [] - page_highlights = {} - current_bbox = {} - last_y1s = {} - mainHeader = '' - subHeader = '' - matched_header_line_norm = heading_to_search - break_collecting = False - heading_norm = normalize_text(heading_to_search) - - for page_num in range(heading_to_searchPageNum,len(doc)): - print('hereeeeeeeeeeeeeee1') - if page_num in toc_pages: - continue - if break_collecting: - break - page=doc[page_num] - page_height = page.rect.height - blocks = 
page.get_text("dict")["blocks"] - - for block in blocks: - if break_collecting: - break - - lines = block.get("lines", []) - i = 0 - while i < len(lines): - if break_collecting: - break - - spans = lines[i].get("spans", []) - if not spans: - i += 1 - continue - - y0 = spans[0]["bbox"][1] - y1 = spans[0]["bbox"][3] - if y0 < top_margin or y1 > (page_height - bottom_margin): - i += 1 - continue - - line_text = get_spaced_text_from_spans(spans).lower() - line_text_norm = normalize_text(line_text) - - # Combine with next line if available - if i + 1 < len(lines): - next_spans = lines[i + 1].get("spans", []) - next_line_text = get_spaced_text_from_spans(next_spans).lower() - combined_line_norm = normalize_text(line_text + " " + next_line_text) - else: - combined_line_norm = line_text_norm - # Optimized header matching - existsfull = ( - ( combined_line_norm in allchildrenheaders_set or - combined_line_norm in allchildrenheaders ) and heading_to_search in combined_line_norm - ) - - # New word-based matching - current_line_words = set(combined_line_norm.split()) - heading_words = set(heading_norm.split()) - all_words_match = current_line_words.issubset(heading_words) and len(current_line_words) > 0 - - substring_match = ( - heading_norm in combined_line_norm or - combined_line_norm in heading_norm or - all_words_match # Include the new word-based matching - ) - - if (substring_match and existsfull and not collecting and - len(combined_line_norm) > 0 ):#and (headertoContinue1 or headertoContinue2) ): - - # Check header conditions more efficiently - header_spans = [ - span for span in spans - if (is_header(span, most_common_font_size, most_common_color, most_common_font) - # and span['size'] >= subsubheaderFontSize - and span['size'] < mainHeaderFontSize) - ] - if header_spans: - collecting = True - matched_header_font_size = max(span["size"] for span in header_spans) - print(f"📥 Start collecting after header: {combined_line_norm} (Font size: {matched_header_font_size})") - 
- collected_lines.append(line_text) - valid_spans = [span for span in spans if span.get("bbox")] - - if valid_spans: - x0s = [span["bbox"][0] for span in valid_spans] - x1s = [span["bbox"][2] for span in valid_spans] - y0s = [span["bbox"][1] for span in valid_spans] - y1s = [span["bbox"][3] for span in valid_spans] - - header_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)] - - if page_num in current_bbox: - cb = current_bbox[page_num] - current_bbox[page_num] = [ - min(cb[0], header_bbox[0]), - min(cb[1], header_bbox[1]), - max(cb[2], header_bbox[2]), - max(cb[3], header_bbox[3]) - ] - else: - current_bbox[page_num] = header_bbox - last_y1s[page_num] = header_bbox[3] - x0, y0, x1, y1 = header_bbox - - zoom = 200 - left = int(x0) - top = int(y0) - zoom_str = f"{zoom},{left},{top}" - pageNumberFound = page_num + 1 - - # Build the query parameters - params = { - 'pdfLink': pdf_path, # Your PDF link - 'keyword': heading_to_search, # Your keyword (could be a string or list) - } - - # URL encode each parameter - encoded_params = {key: urllib.parse.quote(value, safe='') for key, value in params.items()} - - # Construct the final encoded link - encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()]) - - # Correctly construct the final URL with page and zoom - final_url = f"{newlink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}" - - # Get current date and time - now = datetime.now() - - # Format the output - formatted_time = now.strftime("%d/%m/%Y %I:%M:%S %p") - # Optionally, add the URL to a DataFrame - new_url= final_url - if type(heading_to_searchDict) != str: - heading_to_searchDict['NBSLink']=new_url - newjsonList.append(heading_to_searchDict) - print("Final URL:", final_url) - i += 2 - continue - else: - if (substring_match and not collecting and - len(combined_line_norm) > 0): # and (headertoContinue1 or headertoContinue2) ): - - # Calculate word match percentage - word_match_percent = words_match_ratio(heading_norm, 
combined_line_norm) * 100 - - # Check if at least 70% of header words exist in this line - meets_word_threshold = word_match_percent >= 100 - - # Check header conditions (including word threshold) - header_spans = [ - span for span in spans - if (is_header(span, most_common_font_size, most_common_color, most_common_font) - # and span['size'] >= subsubheaderFontSize - and span['size'] < mainHeaderFontSize) - ] - - if header_spans and (meets_word_threshold or same_start_word(heading_to_search, combined_line_norm) ): - collecting = True - matched_header_font_size = max(span["size"] for span in header_spans) - print(f"📥 Start collecting after header: {combined_line_norm} " - f"(Font: {matched_header_font_size}, Word match: {word_match_percent:.0f}%)") - - collected_lines.append(line_text) - valid_spans = [span for span in spans if span.get("bbox")] - - if valid_spans: - x0s = [span["bbox"][0] for span in valid_spans] - x1s = [span["bbox"][2] for span in valid_spans] - y0s = [span["bbox"][1] for span in valid_spans] - y1s = [span["bbox"][3] for span in valid_spans] - - header_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)] - - if page_num in current_bbox: - cb = current_bbox[page_num] - current_bbox[page_num] = [ - min(cb[0], header_bbox[0]), - min(cb[1], header_bbox[1]), - max(cb[2], header_bbox[2]), - max(cb[3], header_bbox[3]) - ] - else: - current_bbox[page_num] = header_bbox - - last_y1s[page_num] = header_bbox[3] - x0, y0, x1, y1 = header_bbox - zoom = 200 - left = int(x0) - top = int(y0) - zoom_str = f"{zoom},{left},{top}" - pageNumberFound = page_num + 1 - - # Build the query parameters - params = { - 'pdfLink': pdf_path, # Your PDF link - 'keyword': heading_to_search, # Your keyword (could be a string or list) - } - - # URL encode each parameter - encoded_params = {key: urllib.parse.quote(value, safe='') for key, value in params.items()} - - # Construct the final encoded link - encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()]) 
- - # Correctly construct the final URL with page and zoom - final_url = f"{newlink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}" - new_url= final_url - if type(heading_to_searchDict) != str: - heading_to_searchDict['NBSLink']=new_url - newjsonList.append(heading_to_searchDict) - print("Final URL:", final_url) - i += 2 - continue - if collecting: - norm_line = normalize_text(line_text) - - # Optimized URL check - if url_pattern.match(norm_line): - line_is_header = False - else: - line_is_header = any(is_header(span, most_common_font_size, most_common_color, most_common_font) for span in spans) - - if line_is_header: - header_font_size = max(span["size"] for span in spans) - is_probably_real_header = ( - header_font_size >= matched_header_font_size and - is_header(spans[0], most_common_font_size, most_common_color, most_common_font) and - len(line_text.strip()) > 2 - ) - - if (norm_line != matched_header_line_norm and - norm_line != heading_norm and - is_probably_real_header): - if line_text not in heading_norm: - print(f"🛑 Stop at header with same or larger font: '{line_text}' ({header_font_size} ≥ {matched_header_font_size})") - collecting = False - done = True - headertoContinue1 = False - headertoContinue2=False - for page_num, bbox in current_bbox.items(): - bbox[3] = last_y1s.get(page_num, bbox[3]) - page_highlights[page_num] = bbox - - if 'installation' in incomingheader or 'execution' in incomingheader or 'miscellaneous items' in incomingheader : - stringtowrite='Not to be billed' - else: - stringtowrite='To be billed' - highlight_boxes(docHighlights, page_highlights,stringtowrite) - - break_collecting = True - break - - if break_collecting: - break - - collected_lines.append(line_text) - valid_spans = [span for span in spans if span.get("bbox")] - if valid_spans: - x0s = [span["bbox"][0] for span in valid_spans] - x1s = [span["bbox"][2] for span in valid_spans] - y0s = [span["bbox"][1] for span in valid_spans] - y1s = [span["bbox"][3] for span 
in valid_spans] - - line_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)] - - if page_num in current_bbox: - cb = current_bbox[page_num] - current_bbox[page_num] = [ - min(cb[0], line_bbox[0]), - min(cb[1], line_bbox[1]), - max(cb[2], line_bbox[2]), - max(cb[3], line_bbox[3]) - ] - else: - current_bbox[page_num] = line_bbox - - last_y1s[page_num] = line_bbox[3] - i += 1 - - if not done: - for page_num, bbox in current_bbox.items(): - bbox[3] = last_y1s.get(page_num, bbox[3]) - page_highlights[page_num] = bbox - if 'installation' in incomingheader or 'execution' in incomingheader or 'miscellaneous items' in incomingheader : - stringtowrite='Not to be billed' - else: - stringtowrite='To be billed' - highlight_boxes(docHighlights, page_highlights,stringtowrite) - - # docHighlights.save("highlighted_output.pdf", garbage=4, deflate=True) - - pdf_bytes = BytesIO() - docHighlights.save(pdf_bytes) - return pdf_bytes.getvalue(), docHighlights , newjsonList - - - - - top_margin = 70 - bottom_margin = 50 - headertoContinue1 = False - headertoContinue2=False - - parsed_url = urlparse(pdf_path) - filename = os.path.basename(parsed_url.path) - filename = unquote(filename) # decode URL-encoded characters - - # Optimized URL handling - if pdf_path and ('http' in pdf_path or 'dropbox' in pdf_path): - pdf_path = pdf_path.replace('dl=0', 'dl=1') - - # Cache frequently used values - response = requests.get(pdf_path) - pdf_content = BytesIO(response.content) - if not pdf_content: - raise ValueError("No valid PDF content found.") - - doc = fitz.open(stream=pdf_content, filetype="pdf") - docHighlights = fitz.open(stream=pdf_content, filetype="pdf") - most_common_font_size, most_common_color, most_common_font = get_regular_font_size_and_color(doc) - - # Precompute regex patterns - dot_pattern = re.compile(r'\.{3,}') - url_pattern = re.compile(r'https?://\S+|www\.\S+') - - def get_toc_page_numbers(doc, max_pages_to_check=15): - toc_pages = [] - for page_num in range(min(len(doc), 
max_pages_to_check)): - page = doc.load_page(page_num) - blocks = page.get_text("dict")["blocks"] - - dot_line_count = 0 - for block in blocks: - for line in block.get("lines", []): - line_text = get_spaced_text_from_spans(line["spans"]).strip() - if dot_pattern.search(line_text): - dot_line_count += 1 - - if dot_line_count >= 3: - toc_pages.append(page_num) - - return list(range(0, toc_pages[-1] +1)) if toc_pages else toc_pages - - toc_pages = get_toc_page_numbers(doc) - - headers, top_3_font_sizes, smallest_font_size, headersSpans = extract_headers( - doc, toc_pages, most_common_font_size, most_common_color, most_common_font, top_margin, bottom_margin - ) - - hierarchy = build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font) - listofHeaderstoMarkup = get_leaf_headers_with_paths(hierarchy) - print('listofHeaderstoMarkup',listofHeaderstoMarkup) - # Precompute all children headers once - allchildrenheaders = [normalize_text(item['text']) for item, p in listofHeaderstoMarkup] - allchildrenheaders_set = set(allchildrenheaders) # For faster lookups - - df = pd.DataFrame(columns=["NBSLink","Subject","Page","Author","Creation Date","Layer",'Code', 'head above 1', "head above 2"]) - dictionaryNBS={} - data_list_JSON = [] - - if len(top_3_font_sizes)==3: - mainHeaderFontSize, subHeaderFontSize, subsubheaderFontSize = top_3_font_sizes - elif len(top_3_font_sizes)==2: - mainHeaderFontSize= top_3_font_sizes[0] - subHeaderFontSize= top_3_font_sizes[1] - subsubheaderFontSize= top_3_font_sizes[1] - - print("📌 Has TOC:", bool(toc_pages), " | Pages to skip:", toc_pages) - - # Preload all pages to avoid repeated loading - # pages = [doc.load_page(page_num) for page_num in range(len(doc)) if page_num not in toc_pages] - - for heading_to_searchDict, paths in listofHeaderstoMarkup: - heading_to_search = heading_to_searchDict['text'] - heading_to_searchPageNum = heading_to_searchDict['page'] - - print('headertosearch', heading_to_search) - - 
# Initialize variables - headertoContinue1 = False - headertoContinue2 = False - matched_header_line = None - done = False - collecting = False - collected_lines = [] - page_highlights = {} - current_bbox = {} - last_y1s = {} - mainHeader = '' - subHeader = '' - matched_header_line_norm = heading_to_search - break_collecting = False - heading_norm = normalize_text(heading_to_search) - paths_norm = [normalize_text(p) for p in paths[0]] if paths and paths[0] else [] - - for page_num in range(heading_to_searchPageNum,len(doc)): - if page_num in toc_pages: - continue - if break_collecting: - break - page=doc[page_num] - page_height = page.rect.height - blocks = page.get_text("dict")["blocks"] - - for block in blocks: - if break_collecting: - break - - lines = block.get("lines", []) - i = 0 - while i < len(lines): - if break_collecting: - break - - spans = lines[i].get("spans", []) - if not spans: - i += 1 - continue - - y0 = spans[0]["bbox"][1] - y1 = spans[0]["bbox"][3] - if y0 < top_margin or y1 > (page_height - bottom_margin): - i += 1 - continue - - line_text = get_spaced_text_from_spans(spans).lower() - line_text_norm = normalize_text(line_text) - - # Combine with next line if available - if i + 1 < len(lines): - next_spans = lines[i + 1].get("spans", []) - next_line_text = get_spaced_text_from_spans(next_spans).lower() - combined_line_norm = normalize_text(line_text + " " + next_line_text) - else: - combined_line_norm = line_text_norm - - # Check if we should continue processing - if combined_line_norm and combined_line_norm in paths[0]: - print(combined_line_norm) - headertoContinue1 = combined_line_norm - if combined_line_norm and combined_line_norm in paths[-2]: - print(combined_line_norm) - headertoContinue2 = combined_line_norm - if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() : - stringtowrite='Not to be billed' - else: - stringtowrite='To be billed' - # Optimized header matching - 
existsfull = ( - ( combined_line_norm in allchildrenheaders_set or - combined_line_norm in allchildrenheaders ) and heading_to_search in combined_line_norm - ) - - # New word-based matching - current_line_words = set(combined_line_norm.split()) - heading_words = set(heading_norm.split()) - all_words_match = current_line_words.issubset(heading_words) and len(current_line_words) > 0 - - substring_match = ( - heading_norm in combined_line_norm or - combined_line_norm in heading_norm or - all_words_match # Include the new word-based matching - ) - # substring_match = ( - # heading_norm in combined_line_norm or - # combined_line_norm in heading_norm - # ) - - if (substring_match and existsfull and not collecting and - len(combined_line_norm) > 0 ):#and (headertoContinue1 or headertoContinue2) ): - - # Check header conditions more efficiently - header_spans = [ - span for span in spans - if (is_header(span, most_common_font_size, most_common_color, most_common_font) - # and span['size'] >= subsubheaderFontSize - and span['size'] < mainHeaderFontSize) - ] - if header_spans: - collecting = True - matched_header_font_size = max(span["size"] for span in header_spans) - print(f"📥 Start collecting after header: {combined_line_norm} (Font size: {matched_header_font_size})") - - collected_lines.append(line_text) - valid_spans = [span for span in spans if span.get("bbox")] - - if valid_spans: - x0s = [span["bbox"][0] for span in valid_spans] - x1s = [span["bbox"][2] for span in valid_spans] - y0s = [span["bbox"][1] for span in valid_spans] - y1s = [span["bbox"][3] for span in valid_spans] - - header_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)] - - if page_num in current_bbox: - cb = current_bbox[page_num] - current_bbox[page_num] = [ - min(cb[0], header_bbox[0]), - min(cb[1], header_bbox[1]), - max(cb[2], header_bbox[2]), - max(cb[3], header_bbox[3]) - ] - else: - current_bbox[page_num] = header_bbox - last_y1s[page_num] = header_bbox[3] - x0, y0, x1, y1 = header_bbox - - zoom 
= 200 - left = int(x0) - top = int(y0) - zoom_str = f"{zoom},{left},{top}" - pageNumberFound = page_num + 1 - - # Build the query parameters - params = { - 'pdfLink': pdf_path, # Your PDF link - 'keyword': heading_to_search, # Your keyword (could be a string or list) - } - - # URL encode each parameter - encoded_params = {key: urllib.parse.quote(value, safe='') for key, value in params.items()} - - # Construct the final encoded link - encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()]) - - # Correctly construct the final URL with page and zoom - final_url = f"{baselink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}" - - # Get current date and time - now = datetime.now() - - # Format the output - formatted_time = now.strftime("%d/%m/%Y %I:%M:%S %p") - # Optionally, add the URL to a DataFrame - - - data_entry = { - "NBSLink": final_url, - "Subject": heading_to_search, - "Page": str(pageNumberFound), - "Author": "ADR", - "Creation Date": formatted_time, - "Layer": "Initial", - "Code": stringtowrite, - "head above 1": paths[-2], - "head above 2": paths[0], - "MC Connnection": 'Go to ' + paths[0].strip().split()[0] +'/'+ heading_to_search.strip().split()[0] + ' in '+ filename - } - data_list_JSON.append(data_entry) - - # Convert list to JSON - json_output = json.dumps(data_list_JSON, indent=4) - - print("Final URL:", final_url) - i += 2 - continue - else: - if (substring_match and not collecting and - len(combined_line_norm) > 0): # and (headertoContinue1 or headertoContinue2) ): - - # Calculate word match percentage - word_match_percent = words_match_ratio(heading_norm, combined_line_norm) * 100 - - # Check if at least 70% of header words exist in this line - meets_word_threshold = word_match_percent >= 100 - - # Check header conditions (including word threshold) - header_spans = [ - span for span in spans - if (is_header(span, most_common_font_size, most_common_color, most_common_font) - # and span['size'] >= 
subsubheaderFontSize - and span['size'] < mainHeaderFontSize) - ] - - if header_spans and (meets_word_threshold or same_start_word(heading_to_search, combined_line_norm) ): - collecting = True - matched_header_font_size = max(span["size"] for span in header_spans) - print(f"📥 Start collecting after header: {combined_line_norm} " - f"(Font: {matched_header_font_size}, Word match: {word_match_percent:.0f}%)") - - collected_lines.append(line_text) - valid_spans = [span for span in spans if span.get("bbox")] - - if valid_spans: - x0s = [span["bbox"][0] for span in valid_spans] - x1s = [span["bbox"][2] for span in valid_spans] - y0s = [span["bbox"][1] for span in valid_spans] - y1s = [span["bbox"][3] for span in valid_spans] - - header_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)] - - if page_num in current_bbox: - cb = current_bbox[page_num] - current_bbox[page_num] = [ - min(cb[0], header_bbox[0]), - min(cb[1], header_bbox[1]), - max(cb[2], header_bbox[2]), - max(cb[3], header_bbox[3]) - ] - else: - current_bbox[page_num] = header_bbox - - last_y1s[page_num] = header_bbox[3] - x0, y0, x1, y1 = header_bbox - zoom = 200 - left = int(x0) - top = int(y0) - zoom_str = f"{zoom},{left},{top}" - pageNumberFound = page_num + 1 - - # Build the query parameters - params = { - 'pdfLink': pdf_path, # Your PDF link - 'keyword': heading_to_search, # Your keyword (could be a string or list) - } - - # URL encode each parameter - encoded_params = {key: urllib.parse.quote(value, safe='') for key, value in params.items()} - - # Construct the final encoded link - encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()]) - - # Correctly construct the final URL with page and zoom - final_url = f"{baselink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}" - - # Get current date and time - now = datetime.now() - - # Format the output - formatted_time = now.strftime("%d/%m/%Y %I:%M:%S %p") - # Optionally, add the URL to a DataFrame - - - data_entry = { - 
"NBSLink": final_url, - "Subject": heading_to_search, - "Page": str(pageNumberFound), - "Author": "ADR", - "Creation Date": formatted_time, - "Layer": "Initial", - "Code": stringtowrite, - "head above 1": paths[-2], - "head above 2": paths[0], - "MC Connnection": 'Go to ' + paths[0].strip().split()[0] +'/'+ heading_to_search.strip().split()[0] + ' in '+ filename - } - data_list_JSON.append(data_entry) - - # Convert list to JSON - json_output = json.dumps(data_list_JSON, indent=4) - - print("Final URL:", final_url) - i += 2 - continue - if collecting: - norm_line = normalize_text(line_text) - - # Optimized URL check - if url_pattern.match(norm_line): - line_is_header = False - else: - line_is_header = any(is_header(span, most_common_font_size, most_common_color, most_common_font) for span in spans) - - if line_is_header: - header_font_size = max(span["size"] for span in spans) - is_probably_real_header = ( - header_font_size >= matched_header_font_size and - is_header(spans[0], most_common_font_size, most_common_color, most_common_font) and - len(line_text.strip()) > 2 - ) - - if (norm_line != matched_header_line_norm and - norm_line != heading_norm and - is_probably_real_header): - if line_text not in heading_norm: - print(f"🛑 Stop at header with same or larger font: '{line_text}' ({header_font_size} ≥ {matched_header_font_size})") - collecting = False - done = True - headertoContinue1 = False - headertoContinue2=False - for page_num, bbox in current_bbox.items(): - bbox[3] = last_y1s.get(page_num, bbox[3]) - page_highlights[page_num] = bbox - highlight_boxes(docHighlights, page_highlights,stringtowrite) - - break_collecting = True - break - - if break_collecting: - break - - collected_lines.append(line_text) - valid_spans = [span for span in spans if span.get("bbox")] - if valid_spans: - x0s = [span["bbox"][0] for span in valid_spans] - x1s = [span["bbox"][2] for span in valid_spans] - y0s = [span["bbox"][1] for span in valid_spans] - y1s = [span["bbox"][3] for 
span in valid_spans] - - line_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)] - - if page_num in current_bbox: - cb = current_bbox[page_num] - current_bbox[page_num] = [ - min(cb[0], line_bbox[0]), - min(cb[1], line_bbox[1]), - max(cb[2], line_bbox[2]), - max(cb[3], line_bbox[3]) - ] - else: - current_bbox[page_num] = line_bbox - - last_y1s[page_num] = line_bbox[3] - i += 1 - - if not done: - for page_num, bbox in current_bbox.items(): - bbox[3] = last_y1s.get(page_num, bbox[3]) - page_highlights[page_num] = bbox - if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() : - stringtowrite='Not to be billed' - else: - stringtowrite='To be billed' - highlight_boxes(docHighlights, page_highlights,stringtowrite) - - # docHighlights.save("highlighted_output.pdf", garbage=4, deflate=True) - - pdf_bytes = BytesIO() - docHighlights.save(pdf_bytes) - print('JSONN',json_output) - return pdf_bytes.getvalue(), docHighlights , json_output - - - - -######################################################################################################################################################## -######################################################################################################################################################## - - -def extract_section_under_header_tobebilledOnly(pdf_path): - Alltext_tobebilled='' - top_margin = 70 - bottom_margin = 50 - headertoContinue1 = False - headertoContinue2=False - - parsed_url = urlparse(pdf_path) - filename = os.path.basename(parsed_url.path) - filename = unquote(filename) # decode URL-encoded characters - - # Optimized URL handling - if pdf_path and ('http' in pdf_path or 'dropbox' in pdf_path): - pdf_path = pdf_path.replace('dl=0', 'dl=1') - - # Cache frequently used values - response = requests.get(pdf_path) - pdf_content = BytesIO(response.content) - if not pdf_content: - raise ValueError("No valid PDF content found.") - - doc = 
fitz.open(stream=pdf_content, filetype="pdf") - docHighlights = fitz.open(stream=pdf_content, filetype="pdf") - most_common_font_size, most_common_color, most_common_font = get_regular_font_size_and_color(doc) - - # Precompute regex patterns - dot_pattern = re.compile(r'\.{3,}') - url_pattern = re.compile(r'https?://\S+|www\.\S+') - - def get_toc_page_numbers(doc, max_pages_to_check=15): - toc_pages = [] - for page_num in range(min(len(doc), max_pages_to_check)): - page = doc.load_page(page_num) - blocks = page.get_text("dict")["blocks"] - - dot_line_count = 0 - for block in blocks: - for line in block.get("lines", []): - line_text = get_spaced_text_from_spans(line["spans"]).strip() - if dot_pattern.search(line_text): - dot_line_count += 1 - - if dot_line_count >= 3: - toc_pages.append(page_num) - - return list(range(0, toc_pages[-1] +1)) if toc_pages else toc_pages - - toc_pages = get_toc_page_numbers(doc) - - headers, top_3_font_sizes, smallest_font_size, headersSpans = extract_headers( - doc, toc_pages, most_common_font_size, most_common_color, most_common_font, top_margin, bottom_margin - ) - - hierarchy = build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font) - listofHeaderstoMarkup = get_leaf_headers_with_paths(hierarchy) - # print('listofHeaderstoMarkup',listofHeaderstoMarkup) - # Precompute all children headers once - allchildrenheaders = [normalize_text(item['text']) for item, p in listofHeaderstoMarkup] - allchildrenheaders_set = set(allchildrenheaders) # For faster lookups - - df = pd.DataFrame(columns=["NBSLink","Subject","Page","Author","Creation Date","Layer",'Code', 'head above 1', "head above 2"]) - dictionaryNBS={} - data_list_JSON = [] - - if len(top_3_font_sizes)==3: - mainHeaderFontSize, subHeaderFontSize, subsubheaderFontSize = top_3_font_sizes - elif len(top_3_font_sizes)==2: - mainHeaderFontSize= top_3_font_sizes[0] - subHeaderFontSize= top_3_font_sizes[1] - subsubheaderFontSize= top_3_font_sizes[1] 
- - # print("📌 Has TOC:", bool(toc_pages), " | Pages to skip:", toc_pages) - - # Preload all pages to avoid repeated loading - # pages = [doc.load_page(page_num) for page_num in range(len(doc)) if page_num not in toc_pages] - - for heading_to_searchDict, paths in listofHeaderstoMarkup: - heading_to_search = heading_to_searchDict['text'] - heading_to_searchPageNum = heading_to_searchDict['page'] - - # print('headertosearch', heading_to_search) - - # Initialize variables - headertoContinue1 = False - headertoContinue2 = False - matched_header_line = None - done = False - collecting = False - collected_lines = [] - page_highlights = {} - current_bbox = {} - last_y1s = {} - mainHeader = '' - subHeader = '' - matched_header_line_norm = heading_to_search - break_collecting = False - heading_norm = normalize_text(heading_to_search) - paths_norm = [normalize_text(p) for p in paths[0]] if paths and paths[0] else [] - - for page_num in range(heading_to_searchPageNum,len(doc)): - if page_num in toc_pages: - continue - if break_collecting: - break - page=doc[page_num] - page_height = page.rect.height - blocks = page.get_text("dict")["blocks"] - - for block in blocks: - if break_collecting: - break - - lines = block.get("lines", []) - i = 0 - while i < len(lines): - if break_collecting: - break - - spans = lines[i].get("spans", []) - if not spans: - i += 1 - continue - - y0 = spans[0]["bbox"][1] - y1 = spans[0]["bbox"][3] - if y0 < top_margin or y1 > (page_height - bottom_margin): - i += 1 - continue - - line_text = get_spaced_text_from_spans(spans).lower() - line_text_norm = normalize_text(line_text) - - # Combine with next line if available - if i + 1 < len(lines): - next_spans = lines[i + 1].get("spans", []) - next_line_text = get_spaced_text_from_spans(next_spans).lower() - combined_line_norm = normalize_text(line_text + " " + next_line_text) - else: - combined_line_norm = line_text_norm - - # Check if we should continue processing - if combined_line_norm and 
combined_line_norm in paths[0]: - print(combined_line_norm) - headertoContinue1 = combined_line_norm - if combined_line_norm and combined_line_norm in paths[-2]: - print(combined_line_norm) - headertoContinue2 = combined_line_norm - if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() : - stringtowrite='Not to be billed' - else: - stringtowrite='To be billed' - # Optimized header matching - existsfull = ( - ( combined_line_norm in allchildrenheaders_set or - combined_line_norm in allchildrenheaders ) and heading_to_search in combined_line_norm - ) - - # New word-based matching - current_line_words = set(combined_line_norm.split()) - heading_words = set(heading_norm.split()) - all_words_match = current_line_words.issubset(heading_words) and len(current_line_words) > 0 - - substring_match = ( - heading_norm in combined_line_norm or - combined_line_norm in heading_norm or - all_words_match # Include the new word-based matching - ) - # substring_match = ( - # heading_norm in combined_line_norm or - # combined_line_norm in heading_norm - # ) - - if (substring_match and existsfull and not collecting and - len(combined_line_norm) > 0 ):#and (headertoContinue1 or headertoContinue2) ): - - # Check header conditions more efficiently - header_spans = [ - span for span in spans - if (is_header(span, most_common_font_size, most_common_color, most_common_font) - # and span['size'] >= subsubheaderFontSize - and span['size'] < mainHeaderFontSize) - ] - if header_spans and stringtowrite.startswith('To'): - Alltext_tobebilled+=combined_line_norm - collecting = True - matched_header_font_size = max(span["size"] for span in header_spans) - # print(f"📥 Start collecting after header: {combined_line_norm} (Font size: {matched_header_font_size})") - - collected_lines.append(line_text) - valid_spans = [span for span in spans if span.get("bbox")] - - if valid_spans: - x0s = [span["bbox"][0] for span in valid_spans] - x1s = 
[span["bbox"][2] for span in valid_spans] - y0s = [span["bbox"][1] for span in valid_spans] - y1s = [span["bbox"][3] for span in valid_spans] - - header_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)] - - if page_num in current_bbox: - cb = current_bbox[page_num] - current_bbox[page_num] = [ - min(cb[0], header_bbox[0]), - min(cb[1], header_bbox[1]), - max(cb[2], header_bbox[2]), - max(cb[3], header_bbox[3]) - ] - else: - current_bbox[page_num] = header_bbox - last_y1s[page_num] = header_bbox[3] - x0, y0, x1, y1 = header_bbox - - zoom = 200 - left = int(x0) - top = int(y0) - zoom_str = f"{zoom},{left},{top}" - pageNumberFound = page_num + 1 - - # Build the query parameters - params = { - 'pdfLink': pdf_path, # Your PDF link - 'keyword': heading_to_search, # Your keyword (could be a string or list) - } - - # URL encode each parameter - encoded_params = {key: urllib.parse.quote(value, safe='') for key, value in params.items()} - - # Construct the final encoded link - encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()]) - - # Correctly construct the final URL with page and zoom - final_url = f"{tobebilledonlyLink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}" - - # Get current date and time - now = datetime.now() - - # Format the output - formatted_time = now.strftime("%d/%m/%Y %I:%M:%S %p") - # Optionally, add the URL to a DataFrame - - - data_entry = { - "NBSLink": final_url, - "Subject": heading_to_search, - "Page": str(pageNumberFound), - "Author": "ADR", - "Creation Date": formatted_time, - "Layer": "Initial", - "Code": stringtowrite, - "head above 1": paths[-2], - "head above 2": paths[0], - "MC Connnection": 'Go to ' + paths[0].strip().split()[0] +'/'+ heading_to_search.strip().split()[0] + ' in '+ filename - } - data_list_JSON.append(data_entry) - - # Convert list to JSON - json_output = json.dumps(data_list_JSON, indent=4) - - # print("Final URL:", final_url) - i += 2 - continue - else: - if (substring_match and 
not collecting and - len(combined_line_norm) > 0): # and (headertoContinue1 or headertoContinue2) ): - - # Calculate word match percentage - word_match_percent = words_match_ratio(heading_norm, combined_line_norm) * 100 - - # Check if at least 70% of header words exist in this line - meets_word_threshold = word_match_percent >= 100 - - # Check header conditions (including word threshold) - header_spans = [ - span for span in spans - if (is_header(span, most_common_font_size, most_common_color, most_common_font) - # and span['size'] >= subsubheaderFontSize - and span['size'] < mainHeaderFontSize) - ] - - if header_spans and (meets_word_threshold or same_start_word(heading_to_search, combined_line_norm) ) and stringtowrite.startswith('To'): - Alltext_tobebilled+=combined_line_norm - collecting = True - matched_header_font_size = max(span["size"] for span in header_spans) - # print(f"📥 Start collecting after header: {combined_line_norm} " - # f"(Font: {matched_header_font_size}, Word match: {word_match_percent:.0f}%)") - - collected_lines.append(line_text) - valid_spans = [span for span in spans if span.get("bbox")] - - if valid_spans: - x0s = [span["bbox"][0] for span in valid_spans] - x1s = [span["bbox"][2] for span in valid_spans] - y0s = [span["bbox"][1] for span in valid_spans] - y1s = [span["bbox"][3] for span in valid_spans] - - header_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)] - - if page_num in current_bbox: - cb = current_bbox[page_num] - current_bbox[page_num] = [ - min(cb[0], header_bbox[0]), - min(cb[1], header_bbox[1]), - max(cb[2], header_bbox[2]), - max(cb[3], header_bbox[3]) - ] - else: - current_bbox[page_num] = header_bbox - - last_y1s[page_num] = header_bbox[3] - x0, y0, x1, y1 = header_bbox - zoom = 200 - left = int(x0) - top = int(y0) - zoom_str = f"{zoom},{left},{top}" - pageNumberFound = page_num + 1 - - # Build the query parameters - params = { - 'pdfLink': pdf_path, # Your PDF link - 'keyword': heading_to_search, # Your keyword (could be 
a string or list) - } - - # URL encode each parameter - encoded_params = {key: urllib.parse.quote(value, safe='') for key, value in params.items()} - - # Construct the final encoded link - encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()]) - - # Correctly construct the final URL with page and zoom - final_url = f"{tobebilledonlyLink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}" - - # Get current date and time - now = datetime.now() - - # Format the output - formatted_time = now.strftime("%d/%m/%Y %I:%M:%S %p") - # Optionally, add the URL to a DataFrame - - - data_entry = { - "NBSLink": final_url, - "Subject": heading_to_search, - "Page": str(pageNumberFound), - "Author": "ADR", - "Creation Date": formatted_time, - "Layer": "Initial", - "Code": stringtowrite, - "head above 1": paths[-2], - "head above 2": paths[0], - "MC Connnection": 'Go to ' + paths[0].strip().split()[0] +'/'+ heading_to_search.strip().split()[0] + ' in '+ filename - } - data_list_JSON.append(data_entry) - - # Convert list to JSON - json_output = json.dumps(data_list_JSON, indent=4) - - print("Final URL:", final_url) - i += 2 - continue - if collecting: - norm_line = normalize_text(line_text) - - # Optimized URL check - if url_pattern.match(norm_line): - line_is_header = False - else: - line_is_header = any(is_header(span, most_common_font_size, most_common_color, most_common_font) for span in spans) - - if line_is_header: - header_font_size = max(span["size"] for span in spans) - is_probably_real_header = ( - header_font_size >= matched_header_font_size and - is_header(spans[0], most_common_font_size, most_common_color, most_common_font) and - len(line_text.strip()) > 2 - ) - - if (norm_line != matched_header_line_norm and - norm_line != heading_norm and - is_probably_real_header): - if line_text not in heading_norm: - # print(f"🛑 Stop at header with same or larger font: '{line_text}' ({header_font_size} ≥ {matched_header_font_size})") - collecting 
= False - done = True - headertoContinue1 = False - headertoContinue2=False - for page_num, bbox in current_bbox.items(): - bbox[3] = last_y1s.get(page_num, bbox[3]) - page_highlights[page_num] = bbox - highlight_boxes(docHighlights, page_highlights,stringtowrite) - - break_collecting = True - break - - if break_collecting: - break - - collected_lines.append(line_text) - valid_spans = [span for span in spans if span.get("bbox")] - if valid_spans: - x0s = [span["bbox"][0] for span in valid_spans] - x1s = [span["bbox"][2] for span in valid_spans] - y0s = [span["bbox"][1] for span in valid_spans] - y1s = [span["bbox"][3] for span in valid_spans] - - line_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)] - - if page_num in current_bbox: - cb = current_bbox[page_num] - current_bbox[page_num] = [ - min(cb[0], line_bbox[0]), - min(cb[1], line_bbox[1]), - max(cb[2], line_bbox[2]), - max(cb[3], line_bbox[3]) - ] - else: - current_bbox[page_num] = line_bbox - - last_y1s[page_num] = line_bbox[3] - i += 1 - - if not done: - for page_num, bbox in current_bbox.items(): - bbox[3] = last_y1s.get(page_num, bbox[3]) - page_highlights[page_num] = bbox - if 'installation' in paths[-2].lower() or 'execution' in paths[-2].lower() or 'miscellaneous items' in paths[-2].lower() : - stringtowrite='Not to be billed' - else: - stringtowrite='To be billed' - highlight_boxes(docHighlights, page_highlights,stringtowrite) - - # docHighlights.save("highlighted_output.pdf", garbage=4, deflate=True) - - pdf_bytes = BytesIO() - docHighlights.save(pdf_bytes) - # print('JSONN',json_output) - return pdf_bytes.getvalue(), docHighlights , json_output , Alltext_tobebilled - - - -######################################################################################################################################################## -######################################################################################################################################################## - -def 
extract_section_under_headerRawan(pdf_path,headingjson,pagenum=0,incomingheader=0): - top_margin = 70 - bottom_margin = 50 - # Optimized URL handling - if pdf_path and ('http' in pdf_path or 'dropbox' in pdf_path): - pdf_path = pdf_path.replace('dl=0', 'dl=1') - - # Cache frequently used values - response = requests.get(pdf_path) - pdf_content = BytesIO(response.content) - if not pdf_content: - raise ValueError("No valid PDF content found.") - - doc = fitz.open(stream=pdf_content, filetype="pdf") - docHighlights = fitz.open(stream=pdf_content, filetype="pdf") - most_common_font_size, most_common_color, most_common_font = get_regular_font_size_and_color(doc) - - # Precompute regex patterns - dot_pattern = re.compile(r'\.{3,}') - url_pattern = re.compile(r'https?://\S+|www\.\S+') - - def get_toc_page_numbers(doc, max_pages_to_check=15): - toc_pages = [] - for page_num in range(min(len(doc), max_pages_to_check)): - page = doc.load_page(page_num) - blocks = page.get_text("dict")["blocks"] - - dot_line_count = 0 - for block in blocks: - for line in block.get("lines", []): - line_text = get_spaced_text_from_spans(line["spans"]).strip() - if dot_pattern.search(line_text): - dot_line_count += 1 - - if dot_line_count >= 3: - toc_pages.append(page_num) - - return list(range(0, toc_pages[-1] +1)) if toc_pages else toc_pages - - toc_pages = get_toc_page_numbers(doc) - - headers, top_3_font_sizes, smallest_font_size, headersSpans = extract_headers( - doc, toc_pages, most_common_font_size, most_common_color, most_common_font, top_margin, bottom_margin - ) - - listofheadingsfromrawan=[] - if type(headingjson) == str: - listofheadingsfromrawan.append(headingjson) - headingjson=[headingjson] - else: - for item in headingjson: - listofheadingsfromrawan.append(normalize_text(item['Subject'])) - print('hereeeeeeeeeeeeeee0',listofheadingsfromrawan) - # Precompute all children headers once - allchildrenheaders = listofheadingsfromrawan - print('hereeeeeeeeeeeeeee00',allchildrenheaders) 
- allchildrenheaders_set = set(allchildrenheaders) # For faster lookups - - df = pd.DataFrame(columns=["NBSLink","Subject","Page","Author","Creation Date","Layer",'Code', 'head above 1', "head above 2"]) - data_list_JSON = [] - - if len(top_3_font_sizes)==3: - mainHeaderFontSize, subHeaderFontSize, subsubheaderFontSize = top_3_font_sizes - elif len(top_3_font_sizes)==2: - mainHeaderFontSize= top_3_font_sizes[0] - subHeaderFontSize= top_3_font_sizes[1] - subsubheaderFontSize= top_3_font_sizes[1] - - print("📌 Has TOC:", bool(toc_pages), " | Pages to skip:", toc_pages) - - # Preload all pages to avoid repeated loading - # pages = [doc.load_page(page_num) for page_num in range(len(doc)) if page_num not in toc_pages] - newjsonList=[] - for heading_to_searchDict in headingjson: - if type(heading_to_searchDict) == str: - heading_to_search = heading_to_searchDict - heading_to_searchPageNum = pagenum - else: - heading_to_search = heading_to_searchDict['Subject'] - heading_to_searchPageNum = int(heading_to_searchDict['Page'])-1 - incomingheader = heading_to_searchDict['head above 1'] - - print('hereeeeeeeeeeeeeee0',heading_to_searchPageNum) - done = False - collecting = False - collected_lines = [] - page_highlights = {} - current_bbox = {} - last_y1s = {} - mainHeader = '' - subHeader = '' - matched_header_line_norm = heading_to_search - break_collecting = False - heading_norm = normalize_text(heading_to_search) - - for page_num in range(heading_to_searchPageNum,len(doc)): - print('hereeeeeeeeeeeeeee1') - if page_num in toc_pages: - continue - if break_collecting: - break - page=doc[page_num] - page_height = page.rect.height - blocks = page.get_text("dict")["blocks"] - - for block in blocks: - if break_collecting: - break - - lines = block.get("lines", []) - i = 0 - while i < len(lines): - if break_collecting: - break - - spans = lines[i].get("spans", []) - if not spans: - i += 1 - continue - - y0 = spans[0]["bbox"][1] - y1 = spans[0]["bbox"][3] - if y0 < top_margin or y1 > 
(page_height - bottom_margin): - i += 1 - continue - - line_text = get_spaced_text_from_spans(spans).lower() - line_text_norm = normalize_text(line_text) - - # Combine with next line if available - if i + 1 < len(lines): - next_spans = lines[i + 1].get("spans", []) - next_line_text = get_spaced_text_from_spans(next_spans).lower() - combined_line_norm = normalize_text(line_text + " " + next_line_text) - else: - combined_line_norm = line_text_norm - # Optimized header matching - existsfull = ( - ( combined_line_norm in allchildrenheaders_set or - combined_line_norm in allchildrenheaders ) and heading_to_search in combined_line_norm - ) - - # New word-based matching - current_line_words = set(combined_line_norm.split()) - heading_words = set(heading_norm.split()) - all_words_match = current_line_words.issubset(heading_words) and len(current_line_words) > 0 - - substring_match = ( - heading_norm in combined_line_norm or - combined_line_norm in heading_norm or - all_words_match # Include the new word-based matching - ) - - if (substring_match and existsfull and not collecting and - len(combined_line_norm) > 0 ):#and (headertoContinue1 or headertoContinue2) ): - - # Check header conditions more efficiently - header_spans = [ - span for span in spans - if (is_header(span, most_common_font_size, most_common_color, most_common_font) - # and span['size'] >= subsubheaderFontSize - and span['size'] < mainHeaderFontSize) - ] - if header_spans: - collecting = True - matched_header_font_size = max(span["size"] for span in header_spans) - print(f"📥 Start collecting after header: {combined_line_norm} (Font size: {matched_header_font_size})") - - collected_lines.append(line_text) - valid_spans = [span for span in spans if span.get("bbox")] - - if valid_spans: - x0s = [span["bbox"][0] for span in valid_spans] - x1s = [span["bbox"][2] for span in valid_spans] - y0s = [span["bbox"][1] for span in valid_spans] - y1s = [span["bbox"][3] for span in valid_spans] - - header_bbox = 
[min(x0s), min(y0s), max(x1s), max(y1s)] - - if page_num in current_bbox: - cb = current_bbox[page_num] - current_bbox[page_num] = [ - min(cb[0], header_bbox[0]), - min(cb[1], header_bbox[1]), - max(cb[2], header_bbox[2]), - max(cb[3], header_bbox[3]) - ] - else: - current_bbox[page_num] = header_bbox - last_y1s[page_num] = header_bbox[3] - x0, y0, x1, y1 = header_bbox - - zoom = 200 - left = int(x0) - top = int(y0) - zoom_str = f"{zoom},{left},{top}" - pageNumberFound = page_num + 1 - - # Build the query parameters - params = { - 'pdfLink': pdf_path, # Your PDF link - 'keyword': heading_to_search, # Your keyword (could be a string or list) - } - - # URL encode each parameter - encoded_params = {key: urllib.parse.quote(value, safe='') for key, value in params.items()} - - # Construct the final encoded link - encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()]) - - # Correctly construct the final URL with page and zoom - final_url = f"{newlink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}" - - # Get current date and time - now = datetime.now() - - # Format the output - formatted_time = now.strftime("%d/%m/%Y %I:%M:%S %p") - # Optionally, add the URL to a DataFrame - new_url= final_url - if type(heading_to_searchDict) != str: - heading_to_searchDict['NBSLink']=new_url - newjsonList.append(heading_to_searchDict) - print("Final URL:", final_url) - i += 2 - continue - else: - if (substring_match and not collecting and - len(combined_line_norm) > 0): # and (headertoContinue1 or headertoContinue2) ): - - # Calculate word match percentage - word_match_percent = words_match_ratio(heading_norm, combined_line_norm) * 100 - - # Check if at least 70% of header words exist in this line - meets_word_threshold = word_match_percent >= 100 - - # Check header conditions (including word threshold) - header_spans = [ - span for span in spans - if (is_header(span, most_common_font_size, most_common_color, most_common_font) - # and 
span['size'] >= subsubheaderFontSize - and span['size'] < mainHeaderFontSize) - ] - - if header_spans and (meets_word_threshold or same_start_word(heading_to_search, combined_line_norm) ): - collecting = True - matched_header_font_size = max(span["size"] for span in header_spans) - print(f"📥 Start collecting after header: {combined_line_norm} " - f"(Font: {matched_header_font_size}, Word match: {word_match_percent:.0f}%)") - - collected_lines.append(line_text) - valid_spans = [span for span in spans if span.get("bbox")] - - if valid_spans: - x0s = [span["bbox"][0] for span in valid_spans] - x1s = [span["bbox"][2] for span in valid_spans] - y0s = [span["bbox"][1] for span in valid_spans] - y1s = [span["bbox"][3] for span in valid_spans] - - header_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)] - - if page_num in current_bbox: - cb = current_bbox[page_num] - current_bbox[page_num] = [ - min(cb[0], header_bbox[0]), - min(cb[1], header_bbox[1]), - max(cb[2], header_bbox[2]), - max(cb[3], header_bbox[3]) - ] - else: - current_bbox[page_num] = header_bbox - - last_y1s[page_num] = header_bbox[3] - x0, y0, x1, y1 = header_bbox - zoom = 200 - left = int(x0) - top = int(y0) - zoom_str = f"{zoom},{left},{top}" - pageNumberFound = page_num + 1 - - # Build the query parameters - params = { - 'pdfLink': pdf_path, # Your PDF link - 'keyword': heading_to_search, # Your keyword (could be a string or list) - } - - # URL encode each parameter - encoded_params = {key: urllib.parse.quote(value, safe='') for key, value in params.items()} - - # Construct the final encoded link - encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()]) - - # Correctly construct the final URL with page and zoom - final_url = f"{newlink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}" - new_url= final_url - if type(heading_to_searchDict) != str: - heading_to_searchDict['NBSLink']=new_url - newjsonList.append(heading_to_searchDict) - print("Final URL:", final_url) - i 
+= 2 - continue - if collecting: - norm_line = normalize_text(line_text) - - # Optimized URL check - if url_pattern.match(norm_line): - line_is_header = False - else: - line_is_header = any(is_header(span, most_common_font_size, most_common_color, most_common_font) for span in spans) - - if line_is_header: - header_font_size = max(span["size"] for span in spans) - is_probably_real_header = ( - header_font_size >= matched_header_font_size and - is_header(spans[0], most_common_font_size, most_common_color, most_common_font) and - len(line_text.strip()) > 2 - ) - - if (norm_line != matched_header_line_norm and - norm_line != heading_norm and - is_probably_real_header): - if line_text not in heading_norm: - print(f"🛑 Stop at header with same or larger font: '{line_text}' ({header_font_size} ≥ {matched_header_font_size})") - collecting = False - done = True - headertoContinue1 = False - headertoContinue2=False - for page_num, bbox in current_bbox.items(): - bbox[3] = last_y1s.get(page_num, bbox[3]) - page_highlights[page_num] = bbox - - if 'installation' in incomingheader or 'execution' in incomingheader or 'miscellaneous items' in incomingheader : - stringtowrite='Not to be billed' - else: - stringtowrite='To be billed' - highlight_boxes(docHighlights, page_highlights,stringtowrite) - - break_collecting = True - break - - if break_collecting: - break - - collected_lines.append(line_text) - valid_spans = [span for span in spans if span.get("bbox")] - if valid_spans: - x0s = [span["bbox"][0] for span in valid_spans] - x1s = [span["bbox"][2] for span in valid_spans] - y0s = [span["bbox"][1] for span in valid_spans] - y1s = [span["bbox"][3] for span in valid_spans] - - line_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)] - - if page_num in current_bbox: - cb = current_bbox[page_num] - current_bbox[page_num] = [ - min(cb[0], line_bbox[0]), - min(cb[1], line_bbox[1]), - max(cb[2], line_bbox[2]), - max(cb[3], line_bbox[3]) - ] - else: - current_bbox[page_num] = line_bbox - - 
last_y1s[page_num] = line_bbox[3] - i += 1 - - if not done: - for page_num, bbox in current_bbox.items(): - bbox[3] = last_y1s.get(page_num, bbox[3]) - page_highlights[page_num] = bbox - if 'installation' in incomingheader or 'execution' in incomingheader or 'miscellaneous items' in incomingheader : - stringtowrite='Not to be billed' - else: - stringtowrite='To be billed' - highlight_boxes(docHighlights, page_highlights,stringtowrite) +########################################### Running ##################################################### +#_________________________________________________________________________________________________________________________ +#_________________________________________________________________________________________________________________________ - # docHighlights.save("highlighted_output.pdf", garbage=4, deflate=True) +#_________________________________________________________________________________________________________________________ +#_________________________________________________________________________________________________________________________ - pdf_bytes = BytesIO() - docHighlights.save(pdf_bytes) - return pdf_bytes.getvalue(), docHighlights , newjsonList +#_________________________________________________________________________________________________________________________ +#_________________________________________________________________________________________________________________________ - +if __name__ == '__main__': + app.run(host='0.0.0.0', port=7860) + \ No newline at end of file