Spaces:

Shivangsinha
/

DocDetailer

Sleeping

App Files Files Community

Shivangsinha committed on Jul 23, 2024

Commit

31ef0bb

1 Parent(s): 041f935

initial commit

Browse files

Files changed (11) hide show

EngaigeQuery.py +61 -0
Engaigemodelling.py +162 -0
Procfile +1 -0
app.py +246 -0
metadata1.json +0 -0
requirements.txt +9 -0
static/css/styles.css +209 -0
static/js/scripts.js +0 -0
templates/index.html +40 -0
uploads/employee_handbook_print_1.pdf +0 -0
vercel.json +9 -0

EngaigeQuery.py ADDED Viewed

	@@ -0,0 +1,61 @@

+from sentence_transformers import SentenceTransformer
+import faiss
+import json
+import numpy as np
+# Load the sentence-transformer model used to embed queries.
+model = SentenceTransformer('all-MiniLM-L6-v2')
+index_path = 'vector_indexNLP.faiss'
+metadata_path = 'metadataNLP.json'
+# Read the FAISS index and the JSON metadata from disk; both are passed to
+# query_index() below.
+index = faiss.read_index(index_path)
+with open(metadata_path, 'r') as f:
+    metadata = json.load(f)
+def convert_distance_to_similarity(distance):
+# Assuming the distances are non-negative, we can use a simple conversion:
+    return 1 / (1 + distance)*100
+def query_index(query, model, index, metadata, top_k=5):
+    query_embedding = model.encode(query).reshape(1,-1).astype('float32')
+    D, I = index.search(query_embedding, top_k)
+    results = []
+    for i in range(top_k):
+        doc_metadata = metadata[I[0, i]]
+        similarity_score = convert_distance_to_similarity(D[0, i])
+        result = {
+            "filename": doc_metadata["filename"],
+            "page_num": doc_metadata["page_num"],
+            "standardized_text": doc_metadata["standardized_text"],
+            "question_text":doc_metadata["question_text"],
+            "answerable_text":doc_metadata["answerable_text"],
+            "score":similarity_score
+        }
+        results.append(result)
+    return results
+# Example query, executed as soon as this module is run or imported.
+query = "what is Rule-Based Machine Translation?"
+results = query_index(query, model, index, metadata)
+def create_answer_to_show(query, results):
+    answer = f"Based on your query '{query}', the following relevant information was found:\n\n"
+    for result in results:
+        answer += "\n------------------------------------------------------------------------------------------------------------------\n"
+        answer += f"Filename: {result['filename']}\n"
+        answer += f"Page number: {result['page_num']}\n"
+        answer += f"Related keywords:  {result['question_text'][:100]}...\n"
+        if result['answerable_text']!="":
+            answer += f"Answer: {result['answerable_text'][:500]}\n"
+        answer += f"Relevancy Score: {result['score']}\n"
+    answer += "\nFor more detailed information, please refer to the respective original texts.\n\n\n"
+    return answer
+# Format the example results and print the report.
+answer = create_answer_to_show(query, results)
+print(answer)

Engaigemodelling.py ADDED Viewed

	@@ -0,0 +1,162 @@

+import os
+import fitz  # PyMuPDF
+import tensorflow as tf
+from sentence_transformers import SentenceTransformer
+import numpy as np
+import faiss
+import json
+import re
+# Folder containing all the PDF files to index. The path below is only an example.
+pdf_folder = '/Users/shivangsinha/Downloads/personalProject'
+pdf_text_data = {}
+embeddings = []
+metadata = []
+# Initialize the sentence transformer model
+model = SentenceTransformer('all-MiniLM-L6-v2')
+#model = SentenceTransformer('paraphrase-MiniLM-L6-v2') - Another model was also tried, but the current one seems to work better.
+# converting tensor to string so that to store it in json format.
+def tensor_to_string(tensor):
+    return tensor.numpy().decode("utf-8")  # Assuming utf-8 encoding
+# extract text based on page number so that it is more relevant for search.
+def extract_text_from_pdf_with_page_numbers(pdf_path):
+    doc = fitz.open(pdf_path)
+    text_pages = []
+    for page_num in range(len(doc)):
+        page = doc.load_page(page_num)
+        text = page.get_text()
+        text_pages.append((page_num + 1, text))  # Page numbers are 1-based in fitz
+    return text_pages
+# Making sure inout data is not coming from table of content part and also preprocess all the text which are irrevant for the search.
+def custom_standardization(input_data):
+    # If index pattern is seems to be part of table of content then simply ignore it.
+    index_pattern = re.compile(r'\.{3,}')
+    if bool(index_pattern.search(input_data.numpy().decode('utf-8'))):
+        return ""
+    # Remove URLs
+    stripped_urls = tf.strings.regex_replace(input_data, r"https?://\S+|www\.\S+", "")
+    # Remove email addresses
+    stripped_emails = tf.strings.regex_replace(stripped_urls, r"\S+@\S+", "")
+    # Remove text in angular brackets (usually HTML tags)
+    stripped_brackets = tf.strings.regex_replace(stripped_emails, r"<.*?>", "")
+    # Remove any square brackets and leave the text within square brackets
+    stripped_square_brackets = tf.strings.regex_replace(stripped_brackets, r"\[|\]", "")
+    # Remove alphanumeric characters with digits
+    stripped_digits = tf.strings.regex_replace(stripped_square_brackets, r"\w*\d\w*", "")
+    # Remove non-alphabet characters
+    stripped_non_alpha = tf.strings.regex_replace(stripped_digits, r"[^a-zA-Z\s]", "")
+    # Replace multiple whitespaces with a single whitespace
+    standardized_text = tf.strings.regex_replace(stripped_non_alpha, r"\s+", " ")
+    return standardized_text.numpy().decode('utf-8')
+# For the time being I am using the pattern of question and answer. I am splitting up text into paragraphs which ends with ? mark
+def split_into_paragraphs(text):
+    pattern = r'(?<=\n)(?=\d+\.)'
+    # Split text using the pattern
+    paragraphs = re.split(pattern, text)
+    # Remove leading/trailing whitespace from each paragraph and filter out empty paragraphs
+    paragraphs = [paragraph.strip() for paragraph in paragraphs if paragraph.strip()]
+    return paragraphs
+# This part is for storing the vector of a paragraph in a required format
+def text_to_vectors(paragraphs):
+    vectors = model.encode(paragraphs)
+    return vectors
+# This split is used to Answer the query or simply show the relevant text from the book.
+def split_into_qa(text):
+    # Find the last occurrence of a question mark
+    index_pattern = re.compile(r'\.{3,}')
+    # Split the text at each question mark followed by a newline or space
+    match = re.search(r'(.*\?.*?)\n', text, re.DOTALL)
+    # If a match is found, split the text accordingly
+    if match:
+        question = match.group(1).strip()  # The part before the last question mark
+        answer = text[match.end():].strip()  # The part after the last question mark
+        # Filter out index-like entries in both question and answer
+        if index_pattern.search(question):
+            question = ""  # Ignore this as it looks like an index entry
+        if index_pattern.search(answer):
+            answer = ""  # Ignore this as it looks like an index entry
+    else:
+        question = text.strip()  # No question mark found, consider the entire text as the question
+        answer = ""  # No answer part
+    return question, answer
+# storing vector to use it later while querying
+def store_vectors(paragraphs, vectors, metadata, filename, page_num):
+    for i, (paragraph, vector) in enumerate(zip(paragraphs, vectors)):
+        original_text = paragraph
+        question,answer = split_into_qa(original_text)
+        original_text = paragraph[:500]  # Store the first 500 characters of the original text
+        standardized_text = custom_standardization(tf.constant(paragraph))
+        vector = model.encode(standardized_text).tolist()  # Recompute vector for standardized text
+        metadata.append({
+            "index": f'paragraph-{i}',
+            "filename": filename,
+            "page_num": page_num,
+            "standardized_text": standardized_text,
+            "question_text":question,
+            "answerable_text":answer
+        })
+        embeddings.append(vector)
+# Index every file in pdf_folder whose name ends with '.pdf', one page at a time.
+for filename in os.listdir(pdf_folder):
+    if filename.endswith('.pdf'):
+        pdf_path = os.path.join(pdf_folder, filename)
+        text_pages = extract_text_from_pdf_with_page_numbers(pdf_path)
+        for page_num, text in text_pages:
+            paragraphs = split_into_paragraphs(text)
+            vectors = text_to_vectors(paragraphs)
+            # store_vectors appends to the module-level metadata and embeddings lists.
+            store_vectors(paragraphs, vectors, metadata, filename, page_num)
+        # Keep the raw (page_number, text) pairs for this file.
+        pdf_text_data[filename] = text_pages
+# Output paths for the FAISS index and the JSON metadata
+index_path = 'vector_indexNLP.faiss'
+metadata_path = 'metadataNLP.json'
+# Convert embeddings to numpy array for FAISS
+embeddings_array = np.array(embeddings, dtype='float32')
+# Initialize FAISS index
+# NOTE(review): shape[1] raises IndexError when embeddings is empty, i.e. when
+# no paragraph was stored above.
+dimension = embeddings_array.shape[1]  # Dimension of the embeddings
+index = faiss.IndexFlatL2(dimension)
+# Add embeddings in batches to avoid memory issues; the original author
+# reported problems when adding to the index.
+batch_size = 1000  # Adjust batch size based on available memory
+for i in range(0, len(embeddings), batch_size):
+    batch_embeddings = embeddings_array[i:i+batch_size]
+    index.add(batch_embeddings)
+# Save the FAISS index
+faiss.write_index(index, index_path)
+# Save metadata
+with open(metadata_path, 'w') as f:
+    json.dump(metadata, f)
+print(f"FAISS index saved to: {index_path}")
+print(f"Metadata saved to: {metadata_path}")

Procfile ADDED Viewed

	@@ -0,0 +1 @@


1	+ web: gunicorn app:app

app.py ADDED Viewed

	@@ -0,0 +1,246 @@

+from flask import Flask, request, jsonify,render_template
+from flask_cors import CORS
+import requests
+from sentence_transformers import SentenceTransformer
+import faiss
+import json
+import numpy as np
+import os
+from flask import Flask, request, jsonify
+from flask_cors import CORS
+from werkzeug.utils import secure_filename
+import fitz  # PyMuPDF
+import tensorflow as tf
+from sentence_transformers import SentenceTransformer
+import numpy as np
+import faiss
+import json
+import re
+import shutil
+app = Flask(__name__)
+CORS(app)  # Enable CORS for all routes
+@app.route('/')
+def index():
+    """Serve the landing page rendered from the 'index.html' template."""
+    return render_template('index.html')
+# Folder that receives uploaded files; created at import time if missing.
+UPLOAD_FOLDER = 'uploads'
+os.makedirs(UPLOAD_FOLDER, exist_ok=True)
+app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
+# Sentence-transformer model used both for indexing and for querying.
+model = SentenceTransformer('all-MiniLM-L6-v2')
+# On-disk locations of the FAISS index and its JSON metadata.
+index_path = 'vector_index1.faiss'
+metadata_path = 'metadata1.json'
+# In-memory accumulators: store_vectors() appends to them and
+# save_index_and_metadata() writes them to disk.
+embeddings = []
+metadata = []
+def tensor_to_string(tensor):
+    return tensor.numpy().decode("utf-8")
+def extract_text_from_pdf_with_page_numbers(pdf_path):
+    doc = fitz.open(pdf_path)
+    text_pages = []
+    for page_num in range(len(doc)):
+        page = doc.load_page(page_num)
+        text = page.get_text()
+        text_pages.append((page_num + 1, text))
+    return text_pages
+def custom_standardization(input_data):
+    index_pattern = re.compile(r'\.{3,}')
+    if bool(index_pattern.search(input_data.numpy().decode('utf-8'))):
+        return ""
+    stripped_urls = tf.strings.regex_replace(input_data, r"https?://\S+|www\.\S+", "")
+    stripped_emails = tf.strings.regex_replace(stripped_urls, r"\S+@\S+", "")
+    stripped_brackets = tf.strings.regex_replace(stripped_emails, r"<.*?>", "")
+    stripped_square_brackets = tf.strings.regex_replace(stripped_brackets, r"\[|\]", "")
+    stripped_digits = tf.strings.regex_replace(stripped_square_brackets, r"\w*\d\w*", "")
+    stripped_non_alpha = tf.strings.regex_replace(stripped_digits, r"[^a-zA-Z\s]", "")
+    standardized_text = tf.strings.regex_replace(stripped_non_alpha, r"\s+", " ")
+    return standardized_text.numpy().decode('utf-8')
+def split_into_paragraphs(text):
+    # pattern = r'(?<=\n)(?=\d+)'
+    paragraphs = re.split(r'(?<=\n)(?=\d+|(?=\n\s*\n))', text)
+    paragraphs = [paragraph.strip() for paragraph in paragraphs if paragraph.strip()]
+    return paragraphs
+def text_to_vectors(paragraphs):
+    vectors = model.encode(paragraphs)
+    return vectors
+def split_into_qa(text):
+    # Define the regex pattern to capture the question and answer in one line
+    index_pattern = re.compile(r'\.{3,}')
+    # Split the text at each question mark followed by a newline or space
+    match = re.search(r'(.*\?.*?)\n', text, re.DOTALL)
+    # If a match is found, split the text accordingly
+    if match:
+        question = match.group(1).strip()  # The part before the last question mark
+        answer = text[match.end():].strip()  # The part after the last question mark
+        # Filter out index-like entries in both question and answer
+        if index_pattern.search(question):
+            question = ""  # Ignore this as it looks like an index entry
+        if index_pattern.search(answer):
+            answer = ""  # Ignore this as it looks like an index entry
+    else:
+        question = text.strip()  # No question mark found, consider the entire text as the question
+        answer = ""  # No answer part
+    return question, answer
+def store_vectors(paragraphs, vectors, metadata, filename, page_num):
+    for i, (paragraph, vector) in enumerate(zip(paragraphs, vectors)):
+        original_text = paragraph
+        question, answer = split_into_qa(original_text)
+        original_text = paragraph[:500]
+        standardized_text = custom_standardization(tf.constant(paragraph))
+        vector = model.encode(standardized_text).tolist()
+        metadata.append({
+            "index": f'paragraph-{i}',
+            "filename": filename,
+            "page_num": page_num,
+            "standardized_text": standardized_text,
+            "question_text": question,
+            "answerable_text": answer
+        })
+        embeddings.append(vector)
+@app.route('/upload', methods=['POST'])
+def upload_pdf():
+    if 'file' not in request.files:
+        return jsonify({'error': 'No file part'}), 400
+    file = request.files['file']
+    if file.filename == '':
+        return jsonify({'error': 'No selected file'}), 400
+    if file:
+        # filename = secure_filename(file.filename)
+        # file_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
+        # file.save(file_path)
+        filename = secure_filename(file.filename)
+        # Delete the uploads folder and its contents
+        if os.path.exists(app.config['UPLOAD_FOLDER']):
+            shutil.rmtree(app.config['UPLOAD_FOLDER'])
+        # Recreate the uploads folder
+        os.makedirs(app.config['UPLOAD_FOLDER'])
+        file_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
+        file.save(file_path)
+        try:
+            os.remove('metadata1.json')
+            os.remove('vector_index1.faiss')
+        except OSError as e:
+            print(f"Error: {e.strerror}")
+        process_pdf(file_path, filename)
+        print(file_path+filename)
+        return jsonify({'success': 'File uploaded and processed successfully'})
+def process_pdf(file_path, filename):
+    text_pages = extract_text_from_pdf_with_page_numbers(file_path)
+    for page_num, text in text_pages:
+        paragraphs = split_into_paragraphs(text)
+        vectors = text_to_vectors(paragraphs)
+        store_vectors(paragraphs, vectors, metadata, filename, page_num)
+    save_index_and_metadata()
+def save_index_and_metadata():
+    embeddings_array = np.array(embeddings, dtype='float32')
+    dimension = embeddings_array.shape[1]
+    index = faiss.IndexFlatL2(dimension)
+    batch_size = 1000
+    for i in range(0, len(embeddings), batch_size):
+        batch_embeddings = embeddings_array[i:i+batch_size]
+        index.add(batch_embeddings)
+    faiss.write_index(index, index_path)
+    with open(metadata_path, 'w') as f:
+        json.dump(metadata, f)
+# Load FAISS index and metadata
+def convert_distance_to_similarity(distance):
+    # Assuming the distances are non-negative, we can use a simple conversion:
+    return 1 / (1 + distance) * 100
+def query_index(query, model, index, metadata, top_k=5):
+    query_embedding = model.encode(query).reshape(1, -1).astype('float32')
+    D, I = index.search(query_embedding, top_k)
+    results = []
+    for i in range(top_k):
+        doc_metadata = metadata[I[0, i]]
+        similarity_score = convert_distance_to_similarity(D[0, i])
+        result = {
+            "filename": doc_metadata["filename"],
+            "page_num": doc_metadata["page_num"],
+            "standardized_text": doc_metadata["standardized_text"],
+            "question_text": doc_metadata["question_text"],
+            "answerable_text": doc_metadata["answerable_text"],
+            "score": similarity_score
+        }
+        results.append(result)
+    return results
+def fetch_answer_from_external_api(question,result):
+    data = {
+        "messages": [
+            {
+            "content": "Question=" +question + ",answer to look from Uploaded pdf file and dont include the field name from the json file in answer section = " +str(result) + "answer=Based on your PDF provided , ",
+            "role": "user"
+            }
+        ],
+        "model": "mixtral:8x7b-instruct-v0.1-q6_K"
+    }
+    print("data="+str(data))
+    response = requests.post('https://inf.cl.uni-trier.de/chat/', json=data, headers={'accept': 'application/json', 'Content-Type': 'application/json'})
+    response_data = response.json()
+    return response_data.get('response', '')
+def create_answer_to_show(query, results):
+    answer = f"Based on your query '{query}', the following relevant information was found:\n\n"
+    for result in results:
+        answer += "\n------------------------------------------------------------------------------------------------------------------\n"
+        answer += f"Filename: {result['filename']}\n"
+        answer += f"Page number: {result['page_num']}\n"
+        answer += f"Related keywords: {result['question_text']}...\n"
+        if result['answerable_text'] != "":
+            answer += f"Answer: {result['answerable_text'][:500]}\n"
+        answer += f"Relevancy Score: {result['score']}\n"
+    answer += "\nFor more detailed information, please refer to the respective original texts.\n\n\n"
+    return answer
+@app.route('/api/query', methods=['POST'])
+def query_endpoint():
+    data = request.json
+    query = data.get('query', '')
+    top_k = data.get('top_k', 5)
+    index = faiss.read_index(index_path)
+    with open(metadata_path, 'r') as f:
+        metadata = json.load(f)
+    results = query_index(query, model, index, metadata, top_k)
+    formatted_answer = create_answer_to_show(query, results)
+    answer2 = fetch_answer_from_external_api(query,results[0])
+    print("=>"+answer2)
+    return jsonify({'answer': answer2+"\n\n"+formatted_answer })
+if __name__ == '__main__':
+    app.run(debug=True)

metadata1.json ADDED Viewed

The diff for this file is too large to render. See raw diff

requirements.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+faiss_cpu==1.8.0
+sentence_transformers==3.0.1
+tensorflow==2.16.1
+Flask==3.0.3
+Flask-Cors==4.0.1
+numpy
+tf-keras
+PyMuPDF==1.24.5
+gunicorn

static/css/styles.css ADDED Viewed

	@@ -0,0 +1,209 @@

+@import url('https://fonts.googleapis.com/css?family=Exo:400,700');
+* {
+    margin: 0px;
+    padding: 0px;
+    box-sizing: border-box;
+}
+body {
+    font-family: 'Exo', sans-serif;
+}
+.context {
+    width: 100%;
+    position: absolute;
+    top: 30vh;
+}
+.context h1 {
+    text-align: center;
+    color: #fefefe;
+    font-size: 150px;
+}
+.context h3 {
+    text-align: center;
+    color: #e4b714;
+    font-size: 30px;
+}
+.area {
+    /* Solid fallback colour for browsers without gradient support */
+    background: #000428;
+    /* The legacy prefixed syntax names the starting side ("left"); the
+       "to right" keyword is only valid in the unprefixed form below. */
+    background: -webkit-linear-gradient(left, #000428, #004e92);
+    background: linear-gradient(to right, #000428, #004e92);
+    width: 100%;
+    height: 100vh;
+}
+.circles {
+    position: absolute;
+    top: 0;
+    left: 0;
+    width: 100%;
+    height: 100%;
+    overflow: hidden;
+}
+.circles li {
+    position: absolute;
+    display: block;
+    list-style: none;
+    width: 20px;
+    height: 20px;
+    background: rgba(255, 255, 255, 0.2);
+    animation: animate 25s linear infinite;
+    bottom: -150px;
+}
+.circles li:nth-child(1) {
+    left: 25%;
+    width: 80px;
+    height: 80px;
+    animation-delay: 0s;
+}
+.circles li:nth-child(2) {
+    left: 10%;
+    width: 20px;
+    height: 20px;
+    animation-delay: 2s;
+    animation-duration: 12s;
+}
+.circles li:nth-child(3) {
+    left: 70%;
+    width: 20px;
+    height: 20px;
+    animation-delay: 4s;
+}
+.circles li:nth-child(4) {
+    left: 40%;
+    width: 60px;
+    height: 60px;
+    animation-delay: 0s;
+    animation-duration: 18s;
+}
+.circles li:nth-child(5) {
+    left: 65%;
+    width: 20px;
+    height: 20px;
+    animation-delay: 0s;
+}
+.circles li:nth-child(6) {
+    left: 75%;
+    width: 110px;
+    height: 110px;
+    animation-delay: 3s;
+}
+.circles li:nth-child(7) {
+    left: 35%;
+    width: 150px;
+    height: 150px;
+    animation-delay: 7s;
+}
+.circles li:nth-child(8) {
+    left: 50%;
+    width: 25px;
+    height: 25px;
+    animation-delay: 15s;
+    animation-duration: 45s;
+}
+.circles li:nth-child(9) {
+    left: 20%;
+    width: 15px;
+    height: 15px;
+    animation-delay: 2s;
+    animation-duration: 35s;
+}
+.circles li:nth-child(10) {
+    left: 85%;
+    width: 150px;
+    height: 150px;
+    animation-delay: 0s;
+    animation-duration: 11s;
+}
+@keyframes animate {
+    0% {
+        transform: translateY(0) rotate(0deg);
+        opacity: 1;
+        border-radius: 0;
+    }
+    100% {
+        transform: translateY(-1000px) rotate(720deg);
+        opacity: 0;
+        border-radius: 50%;
+    }
+}
+.context {
+    text-align: center;
+    color: #fff;
+}
+.button-container {
+    margin-top: 50px;
+}
+.register-button {
+    display: inline-block;
+    padding: 10px 20px;
+    background-color: transparent;
+    border: 2px solid #fff;
+    color: #fff;
+    text-decoration: none;
+    font-size: 18px;
+    border-radius: 15px;
+    /* transform is listed so the hover scale below animates instead of jumping */
+    transition: background-color 0.3s, color 0.3s, transform 0.3s;
+}
+.register-button:hover {
+    transform: scale(1.09);
+}
+/* Responsive Design */
+/* For Mobile Devices */
+@media (max-width: 767px) {
+    .context h1 {
+        font-size: 80px;
+        /* Adjust the font size for smaller screens */
+    }
+    .context h3 {
+        font-size: 20px;
+        /* Adjust the font size for smaller screens */
+    }
+}
+/* For Tablet Devices */
+@media (min-width: 768px) and (max-width: 1023px) {
+    .context h1 {
+        font-size: 120px;
+        /* Adjust the font size for tablet screens */
+    }
+    .context h3 {
+        font-size: 25px;
+        /* Adjust the font size for tablet screens */
+    }
+}

static/js/scripts.js ADDED Viewed

File without changes

templates/index.html ADDED Viewed

	@@ -0,0 +1,40 @@

+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Shivang - Flask api</title>
+    <link rel="icon" href="/static/images/logo.ico" type="image/x-icon">
+    <link rel="stylesheet" href="/static/css/styles.css">
+    <link href="https://fonts.googleapis.com/css?family=Exo:400,700" rel="stylesheet">
+</head>
+<body>
+    <div class="area">
+        <div class="circles">
+            <ul>
+                <li></li>
+                <li></li>
+                <li></li>
+                <li></li>
+                <li></li>
+                <li></li>
+                <li></li>
+                <li></li>
+                <li></li>
+                <li></li>
+            </ul>
+        </div>
+    </div>
+    <div class="context">
+        <h1>Flask Api calls</h1>
+        <h4>By Shivang sinha</h4>
+        <ul>
+            <li>/api/query</li>
+            <li>/upload</li>
+        </ul>
+    </div>
+</body>
+</html>

uploads/employee_handbook_print_1.pdf ADDED Viewed

Binary file (649 kB). View file

vercel.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "version": 2,
+    "builds": [
+        {"src": "app.py", "use": "@vercel/python"}
+    ],
+    "routes": [
+        {"src": "/(.*)", "dest": "app.py"}
+    ]
+}