EricIhre committed
Commit 6122580 · 1 Parent(s): 815d5b6

initial commit, copying functionality from approval ai

Files changed (4)
  1. app.py +89 -0
  2. document_processor.py +43 -0
  3. llm_wrapper.py +160 -0
  4. requirements.txt +0 -0
app.py ADDED
@@ -0,0 +1,89 @@
+ # app.py
+
+ import gradio as gr
+ # The modules live at the Space root, so import them directly (the
+ # `product_approval_ai.` package prefix from the source project would
+ # not resolve here) and name only what is actually used.
+ from llm_wrapper import LLMAnalyser_BIG
+ from document_processor import read_text_from_docx, read_text_file
+ import spaces
+
+ # --- Step 1: Define a single, decorated function for the entire GPU task ---
+ # This function receives text, loads the model, runs inference, and returns text.
+ # It keeps all GPU-related objects inside its own process.
+ @spaces.GPU
+ def analyse_on_gpu(document_text: str, regulation_text: str) -> str:
+     """
+     This function runs entirely on the GPU worker:
+     1. It loads the model.
+     2. It performs the analysis.
+     3. It returns the resulting string.
+     """
+     analyser = LLMAnalyser_BIG(model_name="mistralai/Mistral-7B-Instruct-v0.2")
+     analysis_result = analyser.analyse_document(document_text, regulation_text)
+     return analysis_result
+
+
+ # --- Step 2: Define a simple wrapper function for the Gradio UI ---
+ # This function runs in the main process.
+ def gradio_interface_function(application_doc, regulation_doc):
+     """
+     Called by the Gradio button click. It handles file reading and
+     calls the GPU-accelerated function.
+     """
+     if application_doc is None or regulation_doc is None:
+         return "Error: Please upload both the application form and the regulation file."
+
+     print(f"Reading application form: {application_doc.name}")
+     document_text = read_text_from_docx(application_doc.name)
+     print(f"Reading regulation text: {regulation_doc.name}")
+     regulation_text = read_text_file(regulation_doc.name)
+
+     if not document_text or not regulation_text:
+         return "Error: Failed to read content from one or both files."
+
+     print("Files read successfully. Sending to the GPU worker for analysis...")
+
+     # Call the decorated function. This sends the text to the GPU process
+     # and waits for the string result to be returned.
+     result_string = analyse_on_gpu(document_text, regulation_text)
+
+     print("Analysis complete.")
+     return result_string
+
+
+ # --- Step 3: Define and launch the Gradio UI ---
+ # This all runs in the main process. The theme must be passed to
+ # gr.Blocks(); demo.launch() does not accept a `theme` argument.
+ with gr.Blocks(theme=gr.themes.Soft()) as demo:
+     gr.Markdown(
+         """
+         # Document Compliance Analyser
+         Upload an application form (`.docx`) and a regulation text file (`.txt`).
+         The LLM will analyse the document against the regulations and suggest improvements.
+         **Note:** The model is loaded from scratch on every analysis, so each run may take a while.
+         """
+     )
+     with gr.Row():
+         app_file = gr.File(label="Upload Application Form (.docx)", file_types=[".docx"])
+         reg_file = gr.File(label="Upload Regulation Text (.txt)", file_types=[".txt"])
+
+     analyse_btn = gr.Button("Analyse Document", variant="primary")
+
+     output_text = gr.Textbox(
+         label="Analysis Results",
+         lines=25,
+         interactive=False,
+         autoscroll=True,
+     )
+
+     # Connect the button to the wrapper function defined above.
+     analyse_btn.click(
+         fn=gradio_interface_function,
+         inputs=[app_file, reg_file],
+         outputs=output_text,
+     )
+
+ if __name__ == "__main__":
+     demo.launch()
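Note: `app.py` imports `spaces` unconditionally, so as written it only runs inside a Hugging Face Space. Below is a minimal sketch of a guarded import for local testing; the `_NoOpSpaces` shim is hypothetical and not part of this commit. It would replace the plain `import spaces` line so the decorator degrades to a no-op when the package is missing.

    # Hypothetical local-testing shim (not part of this commit).
    try:
        import spaces  # available on Hugging Face Spaces
    except ImportError:
        class _NoOpSpaces:
            @staticmethod
            def GPU(func=None, **kwargs):
                # spaces.GPU is used both bare (@spaces.GPU) and with
                # arguments (@spaces.GPU(duration=...)); support both.
                if func is not None:
                    return func
                return lambda f: f
        spaces = _NoOpSpaces()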
document_processor.py ADDED
@@ -0,0 +1,43 @@
+ import docx  # type: ignore
+
+ def read_text_from_docx(file_path: str) -> str:
+     """
+     Extracts all text from a .docx file and returns it as a single string.
+     """
+     try:
+         doc = docx.Document(file_path)
+         full_text = []
+         for para in doc.paragraphs:
+             full_text.append(para.text)
+         return '\n'.join(full_text)
+     except FileNotFoundError:
+         print(f"Error: The file at {file_path} was not found.")
+         return ""
+     except Exception as e:
+         print(f"An error occurred while reading the docx file: {e}")
+         return ""
+
+ def read_text_file(file_path: str) -> str:
+     """
+     Reads a plain text file and returns its content.
+     """
+     try:
+         with open(file_path, 'r', encoding='utf-8') as f:
+             return f.read()
+     except FileNotFoundError:
+         print(f"Error: The file at {file_path} was not found.")
+         return ""
+     except Exception as e:
+         print(f"An error occurred while reading the text file: {e}")
+         return ""
+
+ # Stashed PDF reader, currently disabled:
+ '''
+ import fitz  # type: ignore # PyMuPDF
+
+ def convert_pdf_to_text(pdf_path):
+     text = ""
+     with fitz.open(pdf_path) as doc:
+         for page in doc:
+             text += page.get_text()
+     return text
+ '''
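For callers that want a single entry point over these readers, here is a small sketch of an extension-based dispatcher; `read_document` is a hypothetical helper, not part of this commit.

    # Hypothetical dispatcher over the readers above (sketch only).
    import os
    from document_processor import read_text_from_docx, read_text_file

    def read_document(file_path: str) -> str:
        ext = os.path.splitext(file_path)[1].lower()
        if ext == ".docx":
            return read_text_from_docx(file_path)
        if ext == ".txt":
            return read_text_file(file_path)
        # A ".pdf" branch could call the stashed PyMuPDF reader once enabled.
        raise ValueError(f"Unsupported file type: {ext}")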
llm_wrapper.py ADDED
@@ -0,0 +1,160 @@
+ from transformers import pipeline
+ import torch
+
+ class LLMAnalyser:
+     """
+     A wrapper for a Hugging Face language model to analyse documents.
+     """
+     def __init__(self, model_name: str = "google/flan-t5-large"):
+         """
+         Initialises the analyser by loading the specified model.
+
+         Args:
+             model_name (str): The name of the Hugging Face model to use.
+         """
+         print("Initialising LLM Analyser...")
+         try:
+             # The 'text2text-generation' pipeline suits instruction-following
+             # encoder-decoder models such as FLAN-T5. Note that the pipeline
+             # runs on CPU unless `device` or `device_map` is specified.
+             self.llm_pipeline = pipeline(
+                 "text2text-generation",
+                 model=model_name,
+                 torch_dtype=torch.bfloat16  # bfloat16 for memory efficiency where supported
+             )
+             print(f"Model '{model_name}' loaded successfully.")
+         except Exception as e:
+             print(f"Failed to load model. Please check your internet connection and library installations. Error: {e}")
+             self.llm_pipeline = None
+
+     def _construct_prompt(self, document_text: str, regulation_text: str) -> str:
+         """
+         Creates a detailed, structured prompt for the language model.
+         """
+         prompt = f"""
+ **CONTEXT:**
+ You are an expert compliance reviewer. Your task is to analyse a document against a set of regulations and suggest improvements.
+
+ **DOCUMENT TO ANALYSE:**
+ ---
+ {document_text}
+ ---
+
+ **REGULATIONS:**
+ {regulation_text}
+
+ **YOUR TASK:**
+ Based on the regulations provided, analyse the document. Provide a summary of your findings and suggest specific, actionable improvements to make the document compliant. Structure your response in two parts:
+ 1. **Compliance Check:** For each point in the regulations, state whether the document complies or not.
+ 2. **Suggested Improvements:** Provide a bulleted list of improvements. If the document is already compliant, state that no improvements are needed.
+ """
+         return prompt
+
+     def analyse_document(self, document_text: str, regulation_text: str) -> str:
+         """
+         Analyses the document text against the regulation text using the LLM.
+
+         Returns:
+             A string containing the model's analysis and suggestions.
+         """
+         if not self.llm_pipeline:
+             return "LLM pipeline is not available. Cannot perform analysis."
+
+         if not document_text or not regulation_text:
+             return "Error: Document text or regulation text is empty."
+
+         prompt = self._construct_prompt(document_text, regulation_text)
+
+         print("Sending request to the language model... (This may take a moment)")
+
+         # max_length caps the length of the generated response; adjust it
+         # to match the complexity of your documents.
+         try:
+             results = self.llm_pipeline(prompt, max_length=512)
+             return results[0]['generated_text']
+         except Exception as e:
+             return f"An error occurred during model inference: {e}"
+
+
+ class LLMAnalyser_BIG:
+     """
+     A wrapper for a large decoder-only chat model to analyse documents.
+     """
+     def __init__(self, model_name: str):
+         """
+         Initialises the analyser by loading the specified model.
+         """
+         print(f"Initialising LLM Analyser ({model_name})...")
+         try:
+             self.llm_pipeline = pipeline(
+                 "text-generation",
+                 model=model_name,
+                 torch_dtype=torch.bfloat16,  # same dtype choice as above
+             )
+             print(f"Model '{model_name}' loaded successfully.")
+         except Exception as e:
+             print(f"Failed to load model. Error: {e}")
+             self.llm_pipeline = None
+
+     def _construct_prompt(self, document_text: str, regulation_text: str) -> str:
+         """
+         Creates a detailed, structured prompt for the language model.
+         """
+         # Modern chat/instruct models each expect their own prompt format, so
+         # we let the tokenizer's chat template render it. Note: some templates,
+         # including Mistral-7B-Instruct-v0.2's (the model app.py selects),
+         # reject a separate "system" role, so the reviewer instructions are
+         # folded into the user turn instead.
+         messages = [
+             {"role": "user", "content": f"""You are an expert compliance reviewer. Your task is to analyse a document against a set of regulations and suggest improvements.
+
+ **DOCUMENT TO ANALYSE:**
+ ---
+ {document_text}
+ ---
+
+ **REGULATIONS:**
+ {regulation_text}
+
+ **YOUR TASK:**
+ Based on the regulations provided, analyse the document. Provide a summary of your findings and suggest specific, actionable improvements to make the document compliant. Structure your response in two parts:
+ 1. **Compliance Check:** For each point in the regulations, state whether the document complies or not.
+ 2. **Suggested Improvements:** Provide a bulleted list of improvements. If the document is already compliant, state that no improvements are needed.
+ """}
+         ]
+
+         prompt = self.llm_pipeline.tokenizer.apply_chat_template(
+             messages,
+             tokenize=False,
+             add_generation_prompt=True
+         )
+         return prompt
+
+     def analyse_document(self, document_text: str, regulation_text: str) -> str:
+         """
+         Analyses the document text against the regulation text using the LLM.
+
+         Returns:
+             A string containing the model's analysis and suggestions.
+         """
+         if not self.llm_pipeline:
+             return "LLM pipeline is not available. Cannot perform analysis."
+         if not document_text or not regulation_text:
+             return "Error: Document text or regulation text is empty."
+
+         prompt = self._construct_prompt(document_text, regulation_text)
+         print("Sending request to the language model... (This may take a moment)")
+
+         try:
+             results = self.llm_pipeline(
+                 prompt,
+                 max_new_tokens=1024,  # increased from 512 to allow for more detailed analysis
+                 do_sample=True,
+                 temperature=0.6,
+                 top_p=0.9,
+             )
+             # The output contains the full text (prompt + generation); keep
+             # only the generated part, which starts where the prompt ends.
+             generated_text = results[0]['generated_text']
+             return generated_text[len(prompt):]
+
+         except Exception as e:
+             return f"An error occurred during model inference: {e}"
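Slicing `generated_text[len(prompt):]` assumes the pipeline echoes the prompt back verbatim, which can drift if the tokenizer normalises whitespace or special tokens. The `text-generation` pipeline also accepts `return_full_text=False`, in which case it returns only the completion. A sketch of the equivalent call (`generate_completion` is a hypothetical standalone helper, reusing the sampling parameters from `analyse_document`):

    def generate_completion(llm_pipeline, prompt: str) -> str:
        # Ask the pipeline to strip the prompt itself rather than
        # slicing the output by character length.
        results = llm_pipeline(
            prompt,
            max_new_tokens=1024,
            do_sample=True,
            temperature=0.6,
            top_p=0.9,
            return_full_text=False,  # return only newly generated text
        )
        return results[0]["generated_text"]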
requirements.txt ADDED
Binary file (254 Bytes).