initial commit, copying functionality from approval ai
- app.py +89 -0
- document_processor.py +43 -0
- llm_wrapper.py +160 -0
- requirements.txt +0 -0
app.py
ADDED
@@ -0,0 +1,89 @@
# app.py

import gradio as gr
# The helper modules sit alongside app.py in this Space, so they are imported
# as top-level modules (not from a `product_approval_ai` package, which does
# not exist in this repo).
from llm_wrapper import *
from document_processor import *
import os
import spaces

# --- Step 1: Define a single, decorated function for the entire GPU task ---
# This function will receive text, load the model, run inference, and return text.
# It keeps all GPU-related objects inside its own process.
@spaces.GPU
def analyse_on_gpu(document_text: str, regulation_text: str) -> str:
    """
    This function runs entirely on the GPU worker.
    1. It loads the model.
    2. It performs the analysis.
    3. It returns the resulting string.
    """
    analyser = LLMAnalyser_BIG(model_name="mistralai/Mistral-7B-Instruct-v0.2")

    analysis_result = analyser.analyse_document(document_text, regulation_text)

    return analysis_result


# --- Step 2: Define a simple wrapper function for the Gradio UI ---
# This function will run in the main process.
def gradio_interface_function(application_doc, regulation_doc):
    """
    This function is called by the Gradio button click.
    It handles file reading and calls the GPU-accelerated function.
    """
    if application_doc is None or regulation_doc is None:
        return "Error: Please upload both the application form and the regulation file."

    print(f"Reading application form: {application_doc.name}")
    document_text = read_text_from_docx(application_doc.name)
    print(f"Reading regulation text: {regulation_doc.name}")
    regulation_text = read_text_file(regulation_doc.name)

    if not document_text or not regulation_text:
        return "Error: Failed to read content from one or both files."

    print("Files read successfully. Sending to the GPU worker for analysis...")

    # Call the decorated function. This sends the text to the GPU process
    # and waits for the string result to be returned.
    result_string = analyse_on_gpu(document_text, regulation_text)

    print("Analysis complete.")
    return result_string


# --- Step 3: Define and launch the Gradio UI ---
# This all runs in the main process. The theme belongs on gr.Blocks();
# launch() does not accept a `theme` argument.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # Document Compliance Analyser
        Upload an application form (`.docx`) and a regulation text file (`.txt`).
        The LLM will analyse the document against the regulations and suggest improvements.
        **Note:** The model is loaded on every analysis, so the first run may take a moment.
        """
    )
    with gr.Row():
        app_file = gr.File(label="Upload Application Form (.docx)", file_types=[".docx"])
        reg_file = gr.File(label="Upload Regulation Text (.txt)", file_types=[".txt"])

    analyze_btn = gr.Button("Analyse Document", variant="primary")

    output_text = gr.Textbox(
        label="Analysis Results",
        lines=25,
        interactive=False,
        autoscroll=True
    )

    # Connect the button to our simple interface function
    analyze_btn.click(
        fn=gradio_interface_function,
        inputs=[app_file, reg_file],
        outputs=output_text
    )

if __name__ == "__main__":
    demo.launch()
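The Step 1 pattern above keeps every GPU-related object inside the `@spaces.GPU` function, with only strings crossing the process boundary. A minimal sketch of that pattern in isolation, assuming a ZeroGPU Space with the `spaces` package available (`echo_device` is a hypothetical illustration, not part of this commit):

import spaces
import torch

@spaces.GPU  # ZeroGPU attaches a GPU only while this function runs
def echo_device(text: str) -> str:
    # All CUDA state stays inside this worker process;
    # the caller only ever sees the returned string.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    return f"[{device}] {text}"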
document_processor.py
ADDED
@@ -0,0 +1,43 @@
import docx  # type: ignore

def read_text_from_docx(file_path: str) -> str:
    """
    Extracts all text from a .docx file and returns it as a single string.
    """
    try:
        doc = docx.Document(file_path)
        full_text = []
        for para in doc.paragraphs:
            full_text.append(para.text)
        return '\n'.join(full_text)
    except FileNotFoundError:
        print(f"Error: The file at {file_path} was not found.")
        return ""
    except Exception as e:
        print(f"An error occurred while reading the docx file: {e}")
        return ""

def read_text_file(file_path: str) -> str:
    """
    Reads a plain text file and returns its content.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()
    except FileNotFoundError:
        print(f"Error: The file at {file_path} was not found.")
        return ""
    except Exception as e:
        print(f"An error occurred while reading the text file: {e}")
        return ""

'''
import fitz  # type: ignore # PyMuPDF

def convert_pdf_to_text(pdf_path):
    text = ""
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text += page.get_text()
    return text
'''
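A quick usage sketch for these helpers; the file names are placeholders, not files in this repo:

from document_processor import read_text_from_docx, read_text_file

doc_text = read_text_from_docx("application_form.docx")  # returns "" on failure
reg_text = read_text_file("regulations.txt")             # returns "" on failure
if doc_text and reg_text:
    print(f"Read {len(doc_text)} and {len(reg_text)} characters.")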
llm_wrapper.py
ADDED
@@ -0,0 +1,160 @@
from transformers import pipeline
import torch

class LLMAnalyser:
    """
    A wrapper for a Hugging Face language model to analyse documents.
    """
    def __init__(self, model_name: str = "google/flan-t5-large"):
        """
        Initialises the analyser by loading the specified model.

        Args:
            model_name (str): The name of the Hugging Face model to use.
        """
        print("Initialising LLM Analyser...")
        try:
            # We use the 'text2text-generation' pipeline, which is suitable for instruction-following models.
            # If a GPU is available, the pipeline will use it automatically.
            self.llm_pipeline = pipeline(
                "text2text-generation",
                model=model_name,
                torch_dtype=torch.bfloat16  # Use bfloat16 for memory efficiency if supported
            )
            print(f"Model '{model_name}' loaded successfully.")
        except Exception as e:
            print(f"Failed to load model. Please check your internet connection and library installations. Error: {e}")
            self.llm_pipeline = None

    def _construct_prompt(self, document_text: str, regulation_text: str) -> str:
        """
        Creates a detailed, structured prompt for the language model.
        """

        prompt = f"""
**CONTEXT:**
You are an expert compliance reviewer. Your task is to analyse a document against a set of regulations and suggest improvements.

**DOCUMENT TO ANALYSE:**
---
{document_text}
---

**REGULATIONS:**
{regulation_text}

**YOUR TASK:**
Based on the regulations provided, analyse the document. Provide a summary of your findings and suggest specific, actionable improvements to make the document compliant. Structure your response in two parts:
1. **Compliance Check:** For each point in the regulations, state whether the document complies or not.
2. **Suggested Improvements:** Provide a bulleted list of improvements. If the document is already compliant, state that no improvements are needed.
"""
        return prompt

    def analyse_document(self, document_text: str, regulation_text: str) -> str:
        """
        Analyses the document text against the regulation text using the LLM.

        Returns:
            A string containing the model's analysis and suggestions.
        """
        if not self.llm_pipeline:
            return "LLM pipeline is not available. Cannot perform analysis."

        if not document_text or not regulation_text:
            return "Error: Document text or regulation text is empty."

        prompt = self._construct_prompt(document_text, regulation_text)

        print("Sending request to the language model... (This may take a moment)")

        # max_length controls the length of the generated response.
        # You may need to adjust this based on the complexity of your documents.
        try:
            results = self.llm_pipeline(prompt, max_length=512)
            return results[0]['generated_text']
        except Exception as e:
            return f"An error occurred during model inference: {e}"


class LLMAnalyser_BIG:
    """
    A wrapper for a larger Hugging Face language model to analyse documents.
    """
    def __init__(self, model_name: str):
        """
        Initialises the analyser by loading the specified model.
        """
        print(f"Initialising LLM Analyser ({model_name})...")
        try:
            self.llm_pipeline = pipeline(
                "text-generation",
                model=model_name,
                torch_dtype=torch.bfloat16,  # same spelling as above; works across transformers versions
            )
            print(f"Model '{model_name}' loaded successfully.")
        except Exception as e:
            print(f"Failed to load model. Error: {e}")
            self.llm_pipeline = None

    def _construct_prompt(self, document_text: str, regulation_text: str) -> str:
        """
        Creates a detailed, structured prompt for the language model.
        """
        # For modern chat/instruct models, it's better to use their specific prompt
        # format. The tokenizer's chat template applies the correct format for the
        # loaded model (Mistral Instruct here).
        messages = [
            {"role": "system", "content": "You are an expert compliance reviewer. Your task is to analyse a document against a set of regulations and suggest improvements."},
            {"role": "user", "content": f"""
**DOCUMENT TO ANALYSE:**
---
{document_text}
---

**REGULATIONS:**
{regulation_text}

**YOUR TASK:**
Based on the regulations provided, analyse the document. Provide a summary of your findings and suggest specific, actionable improvements to make the document compliant. Structure your response in two parts:
1. **Compliance Check:** For each point in the regulations, state whether the document complies or not.
2. **Suggested Improvements:** Provide a bulleted list of improvements. If the document is already compliant, state that no improvements are needed.
"""}
        ]

        prompt = self.llm_pipeline.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        return prompt

    def analyse_document(self, document_text: str, regulation_text: str) -> str:
        """
        Analyses the document text against the regulation text using the LLM.

        Returns:
            A string containing the model's analysis and suggestions.
        """
        if not self.llm_pipeline:
            return "LLM pipeline is not available. Cannot perform analysis."
        if not document_text or not regulation_text:
            return "Error: Document text or regulation text is empty."

        prompt = self._construct_prompt(document_text, regulation_text)
        print("Sending request to the language model... (This may take a moment)")

        try:
            results = self.llm_pipeline(
                prompt,
                max_new_tokens=1024,  # Increased from 512 to allow for more detailed analysis
                do_sample=True,
                temperature=0.6,
                top_p=0.9,
            )
            # The output contains the full text (prompt + generation);
            # we only want the generated part, which starts after the prompt.
            generated_text = results[0]['generated_text']
            return generated_text[len(prompt):]

        except Exception as e:
            return f"An error occurred during model inference: {e}"
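A minimal sketch of driving the wrapper outside Gradio, using the default CPU-friendly model; the sample strings are invented for illustration:

from llm_wrapper import LLMAnalyser

analyser = LLMAnalyser()  # defaults to google/flan-t5-large
print(analyser.analyse_document(
    document_text="Our label lists all ingredients except water.",
    regulation_text="All ingredients, including water, must be listed.",
))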
requirements.txt
ADDED
Binary file (254 Bytes).