Commit · 329ee91
Parent(s): 911e9ef
First Commit

Files changed:
- .gitignore +1 -0
- Dockerfile +14 -0
- app.py +105 -0
- helper.py +295 -0
- requirements.txt +6 -0
.gitignore
ADDED
@@ -0,0 +1 @@
*.ipynb
Dockerfile
ADDED
@@ -0,0 +1,14 @@
# Use the official Python 3.10.12 image
FROM python:3.10.12

# Copy the current directory contents into the container at .
COPY . .

# Set the working directory to /
WORKDIR /

# Install requirements.txt
RUN pip install --no-cache-dir --upgrade -r /requirements.txt

# Start the FastAPI app on port 7860, the default port expected by Spaces
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py
ADDED
@@ -0,0 +1,105 @@
from fastapi import FastAPI, HTTPException
from dotenv import load_dotenv
import boto3
import os
import uvicorn
import logging
from uuid import uuid4
from pydantic import BaseModel
from helper import PdfToSectionConverter

# Load environment variables
load_dotenv()

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Fetch AWS credentials from environment
s3_access_key_id = os.getenv("S3_ACCESS_KEY_ID")
s3_secret_key = os.getenv("S3_SECRET_KEY")
aws_region = os.getenv("AWS_REGION")

# Validate environment variables
if not all([s3_access_key_id, s3_secret_key, aws_region]):
    logger.error("Missing AWS S3 credentials in environment variables.")
    raise ValueError("AWS credentials not set properly.")

# Initialize FastAPI app
app = FastAPI()

# Configure S3 client
s3_client = boto3.client(
    "s3",
    aws_access_key_id=s3_access_key_id,
    aws_secret_access_key=s3_secret_key,
    region_name=aws_region,
)

class PdfRequest(BaseModel):
    s3_file_path: str
    file_title: str
    doc_id: str
    start_page: int = 0
    end_page: int = 0

@app.get("/")
async def start():
    return {"message": "Parser API is Ready"}

@app.post("/convert_pdf")
async def convert_pdf(request: PdfRequest):
    try:
        output_dir = "/tmp"
        output_path = os.path.join(output_dir, "temp_file.pdf")
        doc_id = request.doc_id

        # Ensure the directory exists
        if not os.path.exists(output_dir):
            os.makedirs(output_dir, exist_ok=True)

        # Validate S3 file path
        if not request.s3_file_path.startswith("s3://"):
            raise HTTPException(status_code=400, detail="Invalid S3 file path. Must start with 's3://'")

        try:
            bucket_name, object_key = request.s3_file_path.replace("s3://", "").split("/", 1)
        except ValueError:
            raise HTTPException(status_code=400, detail="Invalid S3 file path format.")

        logger.info(f"Downloading {request.s3_file_path} from S3 bucket {bucket_name}...")

        # Download PDF from S3
        try:
            s3_client.download_file(bucket_name, object_key, output_path)
        except Exception as e:
            logger.error(f"Failed to download file from S3: {str(e)}")
            raise HTTPException(status_code=500, detail="Error downloading file from S3.")

        # Initialize and run the converter
        converter = PdfToSectionConverter()
        output = converter.convert(
            downloaded_pdf_path=output_path,
            file_title=request.file_title,
            doc_id=doc_id,
            start_page_no=request.start_page,
            end_page_no=request.end_page
        )

        # Clean up the temporary file
        os.remove(output_path)

        return {"status": "success", "data": output}

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Unexpected error: {str(e)}")
        raise HTTPException(status_code=500, detail="Internal Server Error.")

def start_server():
    logger.info("Starting Server...")
    uvicorn.run("app:app", host="0.0.0.0", port=8000, reload=True)

if __name__ == "__main__":
    start_server()
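A minimal sketch of calling the endpoint once the server is running. The local URL, bucket, key, title, and page range below are illustrative assumptions, not values from this commit, and the `requests` package is assumed to be installed on the client side.

import requests  # hypothetical client dependency, not part of this repository

# Hypothetical payload: s3_file_path must point at a PDF readable by the configured
# credentials; start_page/end_page are 0-based and inclusive.
payload = {
    "s3_file_path": "s3://my-bucket/reports/example.pdf",
    "file_title": "Example Report",
    "doc_id": "doc-123",
    "start_page": 0,
    "end_page": 4,
}

resp = requests.post("http://localhost:7860/convert_pdf", json=payload, timeout=600)
resp.raise_for_status()
sections = resp.json()["data"]  # list of {"doc_id", "text", "vector_id", "meta", "content_type"}
print(f"Received {len(sections)} sections")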
helper.py
ADDED
@@ -0,0 +1,295 @@
from docling.document_converter import DocumentConverter
import logging
import re
import os
from uuid import uuid4
from typing import List, Optional, Generator, Set
from functools import partial, reduce
from itertools import chain
from PyPDF2 import PdfReader, PdfWriter

tag_list = ["Sources:", "Source:", "Tags-", "Tags:", "CONTENTS", "ANNEX", "EXERCISES", "Project/Activity"]

logger = logging.getLogger(__name__)

try:
    converter = DocumentConverter()
except Exception as e:
    logger.error(f"Error initializing Docling DocumentConverter: {e}")

def split_pdf(input_pdf, output_pdf, start_page, end_page):
    """Writes pages start_page..end_page (inclusive, 0-based) of input_pdf to output_pdf."""
    reader = PdfReader(input_pdf)
    writer = PdfWriter()
    for i in range(start_page, end_page + 1):
        writer.add_page(reader.pages[i])
    with open(output_pdf, "wb") as output_file:
        writer.write(output_file)
    print(f"PDF split successfully: {output_pdf}")

def get_texts(res):
    """Collects the text items of a Docling export dict into one string per page."""
    page_texts = {pg: "" for pg in res['pages'].keys()}
    texts = res.get('texts', [])
    for item in texts:
        for prov in item['prov']:
            page_no = prov['page_no']
            text = item['text']
            page_key = f'{page_no}'
            if page_key not in page_texts:
                page_texts[page_key] = text
            else:
                page_texts[page_key] += ' ' + text
    return page_texts

def clean_the_text(text):
    """
    Cleans the extracted text by removing unnecessary characters and formatting issues.

    Args:
        text (str): The extracted text.

    Returns:
        str: The cleaned text.
    """
    try:
        text = re.sub(r'\n\s*\n', '\n', text)
        text = text.replace("\t", " ")
        text = text.replace("\f", " ")
        text = re.sub(r'\b(\w+\s*)\1{1,}', '\\1', text)  # collapse immediately repeated words
        text = re.sub(r'[^a-zA-Z0-9\s@\-/,.\\]', ' ', text)
        return text.strip()
    except Exception as e:
        logger.error(f"Error cleaning text: {e}")
        return text

def get_tables(res_json):
    """Collects the table grids of a Docling export dict, grouped by page number."""
    page_tables = {pg: [] for pg in res_json['pages'].keys()}
    try:
        tables = res_json.get('tables', [])
        if not isinstance(tables, list):
            raise ValueError("Expected 'tables' to be a list.")
        for table in tables:
            try:
                # Ensure 'prov' exists and has the necessary structure
                prov = table.get('prov', [])
                if not prov or not isinstance(prov, list):
                    raise ValueError("Missing or invalid 'prov' structure in table.")
                page_no = prov[0].get('page_no')
                if page_no is None:
                    raise ValueError("Missing or invalid 'page_no' in 'prov'.")
                # Ensure 'data' and 'grid' exist
                data = table.get('data', {})
                grid = data.get('grid', [])
                if not isinstance(grid, list):
                    raise ValueError("Missing or invalid 'grid' structure in 'data'.")
                # Add the grid to the tables of its page
                page_tables[f'{page_no}'].append(grid)

            except Exception as table_error:
                print(f"Error processing table: {table_error}")

    except Exception as e:
        print(f"Error processing tables: {e}")

    return page_tables

def table_to_text_or_json(table, rtrn_type="text"):
    """
    Converts a table grid to a single string (only the "text" return type is currently implemented).

    Args:
        table (list): The table grid (rows of cells) to convert.
        rtrn_type (str): The return type, either "text" or "json". Default is "text".

    Returns:
        str: The table converted to the specified format.
    """
    table_text = "Here is a Table : \n"
    for row in table:
        for col in row:
            val = col.get('text')
            table_text += f'{val} ,'
        table_text += '\n'
    return table_text

def clean_file_name(text: str):
    """
    Cleans the file name by removing any special characters.

    Args:
        text (str): The original file name.

    Returns:
        str: The cleaned file name.
    """
    try:
        text = re.sub('[^a-zA-Z0-9 \n\.]', ' ', text)
        return text
    except Exception as e:
        logger.error(f"Error cleaning file name: {e}")
        return text

def find_and_remove_header_footer(
    text: str, n_chars: int, n_first_pages_to_ignore: int, n_last_pages_to_ignore: int
) -> str:
    """
    Heuristic to find footers and headers across different pages by searching for the longest common string.
    For headers we only search in the first n_chars characters (for footers: the last n_chars).
    Note: This heuristic uses exact matches and therefore works well for footers like "Copyright 2019 by XXX",
    but won't detect "Page 3 of 4" or similar.

    :param n_chars: number of first/last characters where the header/footer shall be searched in
    :param n_first_pages_to_ignore: number of first pages to ignore (e.g. TOCs often don't contain footer/header)
    :param n_last_pages_to_ignore: number of last pages to ignore
    :return: the text with the detected header and footer removed from every page
    """
    pages = text.split("\f")
    # Compute the end of the slice explicitly; a bare -n_last_pages_to_ignore would
    # produce an empty slice when n_last_pages_to_ignore is 0.
    last_index = len(pages) - n_last_pages_to_ignore

    # header
    start_of_pages = [p[:n_chars] for p in pages[n_first_pages_to_ignore:last_index]]
    found_header = find_longest_common_ngram(start_of_pages)
    if found_header:
        pages = [page.replace(found_header, "") for page in pages]

    # footer
    end_of_pages = [p[-n_chars:] for p in pages[n_first_pages_to_ignore:last_index]]
    found_footer = find_longest_common_ngram(end_of_pages)
    if found_footer:
        pages = [page.replace(found_footer, "") for page in pages]
    logger.debug(f"Removed header '{found_header}' and footer '{found_footer}' in document")
    text = "\f".join(pages)
    return text

def ngram(seq: str, n: int) -> Generator[str, None, None]:
    """
    Return ngrams (of tokens - currently split by whitespace).

    :param seq: str, string from which the ngrams shall be created
    :param n: int, n of ngram
    :return: generator of ngram strings
    """
    # In order to maintain the original whitespace, but still consider \n and \t for n-gram tokenization,
    # we add a space here and remove it after creation of the ngrams again (see below)
    seq = seq.replace("\n", " \n")
    seq = seq.replace("\t", " \t")

    words = seq.split(" ")
    ngrams = (
        " ".join(words[i : i + n]).replace(" \n", "\n").replace(" \t", "\t") for i in range(0, len(words) - n + 1)
    )

    return ngrams

def allngram(seq: str, min_ngram: int, max_ngram: int) -> Set[str]:
    """Return the set of all ngrams of seq with lengths in [min_ngram, max_ngram)."""
    lengths = range(min_ngram, max_ngram) if max_ngram else range(min_ngram, len(seq))
    ngrams = map(partial(ngram, seq), lengths)
    res = set(chain.from_iterable(ngrams))
    return res

def find_longest_common_ngram(
    sequences: List[str], max_ngram: int = 30, min_ngram: int = 3
) -> Optional[str]:
    """
    Find the longest common ngram across different text sequences (e.g. start of pages),
    considering all ngrams between the specified range. Helpful for finding footers, headers etc.

    :param sequences: list[str], list of strings that shall be searched for common n_grams
    :param max_ngram: int, maximum length of ngram to consider
    :param min_ngram: minimum length of ngram to consider
    :return: str, common string of all sections
    """
    sequences = [s for s in sequences if s]  # filter empty sequences
    if not sequences:
        return None
    seqs_ngrams = map(partial(allngram, min_ngram=min_ngram, max_ngram=max_ngram), sequences)
    intersection = reduce(set.intersection, seqs_ngrams)

    try:
        longest = max(intersection, key=len)
    except ValueError:
        # no common sequence found
        longest = ""
    return longest if longest.strip() else None


class PdfToSectionConverter():
    def __init__(self):
        """
        Initializes the PdfToSectionConverter class.
        """
        pass

    def convert(self, downloaded_pdf_path: str, file_title: str, doc_id: str = None, start_page_no: int = 0,
                end_page_no: int = 0):
        """
        Converts a PDF document to sections with metadata.

        Args:
            downloaded_pdf_path (str): Path to the downloaded PDF file.
            file_title (str): The title of the file.
            doc_id (str, optional): The document ID. Defaults to None.
            start_page_no (int, optional): The starting page number. Defaults to 0.
            end_page_no (int, optional): The ending page number. Defaults to 0.

        Returns:
            list: A list of dictionaries containing sections and metadata.
        """
        try:
            print(f"Splitting pdf from page {start_page_no + 1} to {end_page_no + 1}")
            output_path = "/tmp/splitted.pdf"
            split_pdf(downloaded_pdf_path, output_path, start_page_no, end_page_no)
            print("OCR Started ....")
            result = converter.convert(output_path)
            json_objects = result.document.export_to_dict()
            pages = list(json_objects['pages'].keys())
            texts = get_texts(json_objects)
            tables = get_tables(json_objects)
        except Exception as e:
            logger.error(f"Error getting JSON result from parser: {e}")
            return []

        output_doc_lst = []
        page_no = start_page_no
        try:
            for page in pages:
                if page_no > end_page_no:
                    break
                page_no += 1
                print(f"Page Number to be processed: {page_no}")
                meta = {"doc_id": doc_id, "page_no": page_no, "img_count": 0, "img_lst": []}
                meta_table = {"doc_id": doc_id, "page_no": page_no, "img_count": 0, "img_lst": "[]"}

                # Extract text from the page
                text_to_append = texts[page]
                text_to_append = clean_the_text(text_to_append)

                # Detect and extract tables
                tables_to_append = tables[page]
                if tables_to_append:
                    tables_to_append = [table_to_text_or_json(table=i, rtrn_type="text") for i in tables_to_append]

                # Add the processed section to the output list
                output_doc_lst.append(
                    {"doc_id": doc_id, "text": text_to_append, "vector_id": str(uuid4()),
                     "meta": meta, "content_type": 'text'})
                for table in tables_to_append:
                    output_doc_lst.append(
                        {"doc_id": doc_id, "text": table, "vector_id": str(uuid4()),
                         "meta": meta_table, "content_type": 'table'})

            # Post-process text to remove headers and footers
            text_to_append_list = "\f".join([i['text'] for i in output_doc_lst])
            text_to_append_list = find_and_remove_header_footer(text=text_to_append_list, n_chars=10,
                                                                n_first_pages_to_ignore=0,
                                                                n_last_pages_to_ignore=0).split("\f")

            for i in range(len(output_doc_lst)):
                output_doc_lst[i]['text'] = clean_file_name(file_title) + "\n" + text_to_append_list[i]

        except Exception as e:
            logger.error(f"Error converting PDF to sections: {e}")

        return output_doc_lst
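For quick local testing without the API layer, the converter class can also be driven directly. This is a minimal sketch; the local PDF path, title, doc_id, and page range are illustrative assumptions and not part of the commit.

from helper import PdfToSectionConverter  # assumes the repository root is on the import path

# Hypothetical local PDF; pages 0-2 (inclusive) are parsed.
converter = PdfToSectionConverter()
sections = converter.convert(
    downloaded_pdf_path="/tmp/sample.pdf",
    file_title="Sample PDF",
    doc_id="local-test",
    start_page_no=0,
    end_page_no=2,
)
for section in sections:
    print(section["content_type"], section["meta"]["page_no"], section["text"][:80])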
requirements.txt
ADDED
@@ -0,0 +1,6 @@
fastapi==0.115.6
uvicorn==0.34.0
boto3==1.36.13
pydantic==2.10.6
PyPDF2==3.0.1
docling==2.15.1