import os
import fitz  # PyMuPDF
import tensorflow as tf
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
import json
import re
# This folder should contain all the PDF files we want to index. The path below is just an example.
pdf_folder = '/Users/shivangsinha/Downloads/personalProject'
pdf_text_data = {}
embeddings = []
metadata = []

# Initialize the sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')
# model = SentenceTransformer('paraphrase-MiniLM-L6-v2')  # Also tried this model, but the current one works better.
# Convert a tensor to a string so it can be stored in JSON format.
def tensor_to_string(tensor):
    return tensor.numpy().decode("utf-8")  # Assuming UTF-8 encoding
# Extract text page by page so that search results can point back to a page number.
def extract_text_from_pdf_with_page_numbers(pdf_path):
    doc = fitz.open(pdf_path)
    text_pages = []
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        text = page.get_text()
        text_pages.append((page_num + 1, text))  # Store 1-based page numbers (fitz pages are 0-indexed)
    return text_pages
# Skip text that looks like it came from the table of contents and strip everything
# that is irrelevant for search (URLs, emails, HTML tags, digit tokens, punctuation).
def custom_standardization(input_data):
    # If the text contains a dot-leader pattern, it is probably a table-of-contents entry, so ignore it.
    index_pattern = re.compile(r'\.{3,}')
    if bool(index_pattern.search(input_data.numpy().decode('utf-8'))):
        return ""
    # Remove URLs
    stripped_urls = tf.strings.regex_replace(input_data, r"https?://\S+|www\.\S+", "")
    # Remove email addresses
    stripped_emails = tf.strings.regex_replace(stripped_urls, r"\S+@\S+", "")
    # Remove text in angle brackets (usually HTML tags)
    stripped_brackets = tf.strings.regex_replace(stripped_emails, r"<.*?>", "")
    # Remove square brackets but keep the text inside them
    stripped_square_brackets = tf.strings.regex_replace(stripped_brackets, r"\[|\]", "")
    # Remove alphanumeric tokens that contain digits
    stripped_digits = tf.strings.regex_replace(stripped_square_brackets, r"\w*\d\w*", "")
    # Remove non-alphabetic characters
    stripped_non_alpha = tf.strings.regex_replace(stripped_digits, r"[^a-zA-Z\s]", "")
    # Collapse runs of whitespace into a single space
    standardized_text = tf.strings.regex_replace(stripped_non_alpha, r"\s+", " ")
    return standardized_text.numpy().decode('utf-8')
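
# Quick sanity check (an illustration added here, not part of the original pipeline;
# the sample string is made up): URLs, emails, tags and digit tokens should be stripped.
_cleaned = custom_standardization(
    tf.constant("Visit https://example.com or mail foo@bar.com about <b>chapter 12</b>!"))
assert "https" not in _cleaned and "@" not in _cleaned and "12" not in _cleaned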
# For the time being the content follows a numbered question-and-answer layout, so split the
# text into paragraphs wherever a new line starts with a number followed by a period.
def split_into_paragraphs(text):
    pattern = r'(?<=\n)(?=\d+\.)'
    # Split text using the pattern
    paragraphs = re.split(pattern, text)
    # Strip leading/trailing whitespace from each paragraph and drop empty paragraphs
    paragraphs = [paragraph.strip() for paragraph in paragraphs if paragraph.strip()]
    return paragraphs
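
# Illustration (added for clarity, not in the original script; the sample text is made up):
# each numbered question starts a new paragraph.
_demo_paragraphs = split_into_paragraphs(
    "1. What is FAISS?\nA library for vector similarity search.\n2. What is an embedding?\nA dense vector.")
assert len(_demo_paragraphs) == 2 and _demo_paragraphs[1].startswith("2.")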
# Encode a list of paragraphs into embedding vectors.
def text_to_vectors(paragraphs):
    vectors = model.encode(paragraphs)
    return vectors
# Split a paragraph into a question and an answer, so a query can either be answered
# or simply shown the relevant text from the book.
def split_into_qa(text):
    # Pattern used to detect index/table-of-contents style entries (dot leaders)
    index_pattern = re.compile(r'\.{3,}')
    # Match everything up to the last question mark and the end of that line
    match = re.search(r'(.*\?.*?)\n', text, re.DOTALL)
    # If a match is found, split the text accordingly
    if match:
        question = match.group(1).strip()  # Everything up to and including the question line
        answer = text[match.end():].strip()  # Everything after the question line
        # Filter out index-like entries in both question and answer
        if index_pattern.search(question):
            question = ""  # Ignore this as it looks like an index entry
        if index_pattern.search(answer):
            answer = ""  # Ignore this as it looks like an index entry
    else:
        question = text.strip()  # No question mark found, treat the entire text as the question
        answer = ""  # No answer part
    return question, answer
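
# Illustration (added for clarity, not in the original script; the sample string is made up):
_q, _a = split_into_qa("1. What is an embedding?\nA dense vector representation of text.")
assert _q.endswith("?") and _a.startswith("A dense")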
# Store each paragraph's vector and metadata so they can be used later when querying.
def store_vectors(paragraphs, vectors, metadata, filename, page_num):
    for i, (paragraph, vector) in enumerate(zip(paragraphs, vectors)):
        original_text = paragraph
        question, answer = split_into_qa(original_text)
        original_text = paragraph[:500]  # Keep only the first 500 characters of the original text
        standardized_text = custom_standardization(tf.constant(paragraph))
        vector = model.encode(standardized_text).tolist()  # Recompute the vector for the standardized text
        metadata.append({
            "index": f'paragraph-{i}',
            "filename": filename,
            "page_num": page_num,
            "standardized_text": standardized_text,
            "question_text": question,
            "answerable_text": answer
        })
        embeddings.append(vector)
# Walk through every PDF in the folder, split each page into paragraphs, and index them.
for filename in os.listdir(pdf_folder):
    if filename.endswith('.pdf'):
        pdf_path = os.path.join(pdf_folder, filename)
        text_pages = extract_text_from_pdf_with_page_numbers(pdf_path)
        for page_num, text in text_pages:
            paragraphs = split_into_paragraphs(text)
            vectors = text_to_vectors(paragraphs)
            store_vectors(paragraphs, vectors, metadata, filename, page_num)
        pdf_text_data[filename] = text_pages
# Paths for the FAISS index and the metadata JSON
index_path = 'vector_indexNLP.faiss'
metadata_path = 'metadataNLP.json'

# Convert embeddings to a numpy array for FAISS
embeddings_array = np.array(embeddings, dtype='float32')

# Initialize the FAISS index
dimension = embeddings_array.shape[1]  # Dimension of the embeddings
index = faiss.IndexFlatL2(dimension)
# Add embeddings in batches to avoid memory issues (adding everything at once caused problems)
batch_size = 1000  # Adjust the batch size based on available memory
for i in range(0, len(embeddings), batch_size):
    batch_embeddings = embeddings_array[i:i+batch_size]
    index.add(batch_embeddings)
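
# Note (added, not part of the original script): IndexFlatL2 performs exact search and is
# fine for a small corpus. For a much larger collection, an approximate index could be
# swapped in, roughly like the sketch below (nlist and the index type are assumptions):
# nlist = 100  # number of clusters
# quantizer = faiss.IndexFlatL2(dimension)
# index = faiss.IndexIVFFlat(quantizer, dimension, nlist)
# index.train(embeddings_array)
# index.add(embeddings_array)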
# Save the FAISS index
faiss.write_index(index, index_path)

# Save metadata
with open(metadata_path, 'w') as f:
    json.dump(metadata, f)

print(f"FAISS index saved to: {index_path}")
print(f"Metadata saved to: {metadata_path}")