EricIhre committed
Commit 6122580 · 1 Parent(s): 815d5b6

initial commit, copying functionality from approval ai

Files changed (4)
  1. app.py +89 -0
  2. document_processor.py +43 -0
  3. llm_wrapper.py +160 -0
  4. requirements.txt +0 -0
app.py ADDED
@@ -0,0 +1,89 @@
+ # app.py
+
+ import gradio as gr
+ # The modules live at the Space root, so import them directly (the
+ # `product_approval_ai.` package prefix from the source project would
+ # not resolve here) and name only what is actually used.
+ from llm_wrapper import LLMAnalyser_BIG
+ from document_processor import read_text_from_docx, read_text_file
+ import spaces
+
+ # --- Step 1: Define a single, decorated function for the entire GPU task ---
+ # This function receives text, loads the model, runs inference, and returns text.
+ # It keeps all GPU-related objects inside its own process.
+ @spaces.GPU
+ def analyse_on_gpu(document_text: str, regulation_text: str) -> str:
+     """
+     This function runs entirely on the GPU worker:
+     1. It loads the model.
+     2. It performs the analysis.
+     3. It returns the resulting string.
+     """
+     analyser = LLMAnalyser_BIG(model_name="mistralai/Mistral-7B-Instruct-v0.2")
+     analysis_result = analyser.analyse_document(document_text, regulation_text)
+     return analysis_result
+
+
+ # --- Step 2: Define a simple wrapper function for the Gradio UI ---
+ # This function runs in the main process.
+ def gradio_interface_function(application_doc, regulation_doc):
+     """
+     Called by the Gradio button click. It handles file reading and
+     calls the GPU-accelerated function.
+     """
+     if application_doc is None or regulation_doc is None:
+         return "Error: Please upload both the application form and the regulation file."
+
+     print(f"Reading application form: {application_doc.name}")
+     document_text = read_text_from_docx(application_doc.name)
+     print(f"Reading regulation text: {regulation_doc.name}")
+     regulation_text = read_text_file(regulation_doc.name)
+
+     if not document_text or not regulation_text:
+         return "Error: Failed to read content from one or both files."
+
+     print("Files read successfully. Sending to the GPU worker for analysis...")
+
+     # Call the decorated function. This sends the text to the GPU process
+     # and waits for the string result to be returned.
+     result_string = analyse_on_gpu(document_text, regulation_text)
+
+     print("Analysis complete.")
+     return result_string
+
+
+ # --- Step 3: Define and launch the Gradio UI ---
+ # This all runs in the main process. The theme must be passed to
+ # gr.Blocks(); demo.launch() does not accept a `theme` argument.
+ with gr.Blocks(theme=gr.themes.Soft()) as demo:
+     gr.Markdown(
+         """
+         # Document Compliance Analyser
+         Upload an application form (`.docx`) and a regulation text file (`.txt`).
+         The LLM will analyse the document against the regulations and suggest improvements.
+         **Note:** The model is loaded from scratch on every analysis, so each run may take a while.
+         """
+     )
+     with gr.Row():
+         app_file = gr.File(label="Upload Application Form (.docx)", file_types=[".docx"])
+         reg_file = gr.File(label="Upload Regulation Text (.txt)", file_types=[".txt"])
+
+     analyse_btn = gr.Button("Analyse Document", variant="primary")
+
+     output_text = gr.Textbox(
+         label="Analysis Results",
+         lines=25,
+         interactive=False,
+         autoscroll=True,
+     )
+
+     # Connect the button to the wrapper function defined above.
+     analyse_btn.click(
+         fn=gradio_interface_function,
+         inputs=[app_file, reg_file],
+         outputs=output_text,
+     )
+
+ if __name__ == "__main__":
+     demo.launch()
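Note: `app.py` imports `spaces` unconditionally, so as written it only runs inside a Hugging Face Space. Below is a minimal sketch of a guarded import for local testing; the `_NoOpSpaces` shim is hypothetical and not part of this commit. It would replace the plain `import spaces` line so the decorator degrades to a no-op when the package is missing.

    # Hypothetical local-testing shim (not part of this commit).
    try:
        import spaces  # available on Hugging Face Spaces
    except ImportError:
        class _NoOpSpaces:
            @staticmethod
            def GPU(func=None, **kwargs):
                # spaces.GPU is used both bare (@spaces.GPU) and with
                # arguments (@spaces.GPU(duration=...)); support both.
                if func is not None:
                    return func
                return lambda f: f
        spaces = _NoOpSpaces()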
document_processor.py ADDED
@@ -0,0 +1,43 @@
+ import docx  # type: ignore
+
+ def read_text_from_docx(file_path: str) -> str:
+     """
+     Extracts all text from a .docx file and returns it as a single string.
+     """
+     try:
+         doc = docx.Document(file_path)
+         full_text = []
+         for para in doc.paragraphs:
+             full_text.append(para.text)
+         return '\n'.join(full_text)
+     except FileNotFoundError:
+         print(f"Error: The file at {file_path} was not found.")
+         return ""
+     except Exception as e:
+         print(f"An error occurred while reading the docx file: {e}")
+         return ""
+
+ def read_text_file(file_path: str) -> str:
+     """
+     Reads a plain text file and returns its content.
+     """
+     try:
+         with open(file_path, 'r', encoding='utf-8') as f:
+             return f.read()
+     except FileNotFoundError:
+         print(f"Error: The file at {file_path} was not found.")
+         return ""
+     except Exception as e:
+         print(f"An error occurred while reading the text file: {e}")
+         return ""
+
+ # Stashed PDF reader, currently disabled:
+ '''
+ import fitz  # type: ignore # PyMuPDF
+
+ def convert_pdf_to_text(pdf_path):
+     text = ""
+     with fitz.open(pdf_path) as doc:
+         for page in doc:
+             text += page.get_text()
+     return text
+ '''
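For callers that want a single entry point over these readers, here is a small sketch of an extension-based dispatcher; `read_document` is a hypothetical helper, not part of this commit.

    # Hypothetical dispatcher over the readers above (sketch only).
    import os
    from document_processor import read_text_from_docx, read_text_file

    def read_document(file_path: str) -> str:
        ext = os.path.splitext(file_path)[1].lower()
        if ext == ".docx":
            return read_text_from_docx(file_path)
        if ext == ".txt":
            return read_text_file(file_path)
        # A ".pdf" branch could call the stashed PyMuPDF reader once enabled.
        raise ValueError(f"Unsupported file type: {ext}")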
llm_wrapper.py ADDED
@@ -0,0 +1,160 @@
+ from transformers import pipeline
+ import torch
+
+ class LLMAnalyser:
+     """
+     A wrapper for a Hugging Face language model to analyse documents.
+     """
+     def __init__(self, model_name: str = "google/flan-t5-large"):
+         """
+         Initialises the analyser by loading the specified model.
+
+         Args:
+             model_name (str): The name of the Hugging Face model to use.
+         """
+         print("Initialising LLM Analyser...")
+         try:
+             # The 'text2text-generation' pipeline suits instruction-following
+             # encoder-decoder models such as FLAN-T5. Note that the pipeline
+             # runs on CPU unless `device` or `device_map` is specified.
+             self.llm_pipeline = pipeline(
+                 "text2text-generation",
+                 model=model_name,
+                 torch_dtype=torch.bfloat16  # bfloat16 for memory efficiency where supported
+             )
+             print(f"Model '{model_name}' loaded successfully.")
+         except Exception as e:
+             print(f"Failed to load model. Please check your internet connection and library installations. Error: {e}")
+             self.llm_pipeline = None
+
+     def _construct_prompt(self, document_text: str, regulation_text: str) -> str:
+         """
+         Creates a detailed, structured prompt for the language model.
+         """
+         prompt = f"""
+ **CONTEXT:**
+ You are an expert compliance reviewer. Your task is to analyse a document against a set of regulations and suggest improvements.
+
+ **DOCUMENT TO ANALYSE:**
+ ---
+ {document_text}
+ ---
+
+ **REGULATIONS:**
+ {regulation_text}
+
+ **YOUR TASK:**
+ Based on the regulations provided, analyse the document. Provide a summary of your findings and suggest specific, actionable improvements to make the document compliant. Structure your response in two parts:
+ 1. **Compliance Check:** For each point in the regulations, state whether the document complies or not.
+ 2. **Suggested Improvements:** Provide a bulleted list of improvements. If the document is already compliant, state that no improvements are needed.
+ """
+         return prompt
+
+     def analyse_document(self, document_text: str, regulation_text: str) -> str:
+         """
+         Analyses the document text against the regulation text using the LLM.
+
+         Returns:
+             A string containing the model's analysis and suggestions.
+         """
+         if not self.llm_pipeline:
+             return "LLM pipeline is not available. Cannot perform analysis."
+
+         if not document_text or not regulation_text:
+             return "Error: Document text or regulation text is empty."
+
+         prompt = self._construct_prompt(document_text, regulation_text)
+
+         print("Sending request to the language model... (This may take a moment)")
+
+         # max_length caps the length of the generated response; adjust it
+         # to match the complexity of your documents.
+         try:
+             results = self.llm_pipeline(prompt, max_length=512)
+             return results[0]['generated_text']
+         except Exception as e:
+             return f"An error occurred during model inference: {e}"
+
+
+ class LLMAnalyser_BIG:
+     """
+     A wrapper for a large decoder-only chat model to analyse documents.
+     """
+     def __init__(self, model_name: str):
+         """
+         Initialises the analyser by loading the specified model.
+         """
+         print(f"Initialising LLM Analyser ({model_name})...")
+         try:
+             self.llm_pipeline = pipeline(
+                 "text-generation",
+                 model=model_name,
+                 torch_dtype=torch.bfloat16,  # same dtype choice as above
+             )
+             print(f"Model '{model_name}' loaded successfully.")
+         except Exception as e:
+             print(f"Failed to load model. Error: {e}")
+             self.llm_pipeline = None
+
+     def _construct_prompt(self, document_text: str, regulation_text: str) -> str:
+         """
+         Creates a detailed, structured prompt for the language model.
+         """
+         # Modern chat/instruct models each expect their own prompt format, so
+         # we let the tokenizer's chat template render it. Note: some templates,
+         # including Mistral-7B-Instruct-v0.2's (the model app.py selects),
+         # reject a separate "system" role, so the reviewer instructions are
+         # folded into the user turn instead.
+         messages = [
+             {"role": "user", "content": f"""You are an expert compliance reviewer. Your task is to analyse a document against a set of regulations and suggest improvements.
+
+ **DOCUMENT TO ANALYSE:**
+ ---
+ {document_text}
+ ---
+
+ **REGULATIONS:**
+ {regulation_text}
+
+ **YOUR TASK:**
+ Based on the regulations provided, analyse the document. Provide a summary of your findings and suggest specific, actionable improvements to make the document compliant. Structure your response in two parts:
+ 1. **Compliance Check:** For each point in the regulations, state whether the document complies or not.
+ 2. **Suggested Improvements:** Provide a bulleted list of improvements. If the document is already compliant, state that no improvements are needed.
+ """}
+         ]
+
+         prompt = self.llm_pipeline.tokenizer.apply_chat_template(
+             messages,
+             tokenize=False,
+             add_generation_prompt=True
+         )
+         return prompt
+
+     def analyse_document(self, document_text: str, regulation_text: str) -> str:
+         """
+         Analyses the document text against the regulation text using the LLM.
+
+         Returns:
+             A string containing the model's analysis and suggestions.
+         """
+         if not self.llm_pipeline:
+             return "LLM pipeline is not available. Cannot perform analysis."
+         if not document_text or not regulation_text:
+             return "Error: Document text or regulation text is empty."
+
+         prompt = self._construct_prompt(document_text, regulation_text)
+         print("Sending request to the language model... (This may take a moment)")
+
+         try:
+             results = self.llm_pipeline(
+                 prompt,
+                 max_new_tokens=1024,  # increased from 512 to allow for more detailed analysis
+                 do_sample=True,
+                 temperature=0.6,
+                 top_p=0.9,
+             )
+             # The output contains the full text (prompt + generation); keep
+             # only the generated part, which starts where the prompt ends.
+             generated_text = results[0]['generated_text']
+             return generated_text[len(prompt):]
+
+         except Exception as e:
+             return f"An error occurred during model inference: {e}"
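Slicing `generated_text[len(prompt):]` assumes the pipeline echoes the prompt back verbatim, which can drift if the tokenizer normalises whitespace or special tokens. The `text-generation` pipeline also accepts `return_full_text=False`, in which case it returns only the completion. A sketch of the equivalent call (`generate_completion` is a hypothetical standalone helper, reusing the sampling parameters from `analyse_document`):

    def generate_completion(llm_pipeline, prompt: str) -> str:
        # Ask the pipeline to strip the prompt itself rather than
        # slicing the output by character length.
        results = llm_pipeline(
            prompt,
            max_new_tokens=1024,
            do_sample=True,
            temperature=0.6,
            top_p=0.9,
            return_full_text=False,  # return only newly generated text
        )
        return results[0]["generated_text"]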
requirements.txt ADDED
Binary file (254 Bytes).