# app.py
# FINAL CPU VERSION using a quantized model for maximum reliability on free hardware.

# 1. Import necessary libraries
import json

import gradio as gr
# **FIXED:** Import AutoModelForCausalLM from the main transformers library
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

# 2. Load the quantized language model
# This model is optimized to use less memory, making it stable on free CPUs.
try:
    model_name_or_path = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"

    # Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

    # Load the quantized model using the standard transformers class.
    # The installed 'optimum' and 'auto-gptq' libraries handle the GPTQ format automatically.
    model = AutoModelForCausalLM.from_pretrained(
        model_name_or_path,
        use_safetensors=True,
        trust_remote_code=False,
        device_map="auto"  # Will automatically use the CPU
    )

    # Create the text-generation pipeline
    generator = pipeline(
        task="text-generation",
        model=model,
        tokenizer=tokenizer
    )
    print("Quantized model loaded successfully on CPU.")
    MODEL_LOADED = True

except Exception as e:
    print(f"Error loading quantized model: {e}")
    generator = None
    MODEL_LOADED = False


# 3. Define the core analysis function
def analyze_document(document_text, query_text):
    """Analyze the document based on the query using the loaded LLM."""
    if not MODEL_LOADED or generator is None:
        return {"error": "Model is not available. Please check the Space logs for errors."}

    # The chat-based prompt format for TinyLlama
    messages = [
        {
            "role": "system",
            "content": """You are an expert AI assistant for a claims processing department.
Your task is to analyze an insurance policy document and a user's query to make a decision.
Based ONLY on the information in the Policy Document, determine whether the request should be approved or rejected.
Provide your final answer in strict JSON format. The JSON object must contain three keys:
"decision" (string, "Approved" or "Rejected"), "amount" (number, 0 if not applicable),
and "justification" (string, explaining your reasoning and citing the policy).
Do not use any information outside of the provided Policy Document."""
        },
        {
            "role": "user",
            "content": f"""
**Policy Document (Source of Truth):**
---
{document_text}
---

**User Query:**
---
{query_text}
---

**JSON Response:**
"""
        }
    ]

    prompt = generator.tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    try:
        # Generate the response from the LLM.
        # return_full_text=False returns only the completion, so the brace search
        # below cannot accidentally match a '{' inside the prompt or document text.
        outputs = generator(
            prompt,
            max_new_tokens=256,
            do_sample=True,
            temperature=0.7,
            top_k=50,
            top_p=0.95,
            return_full_text=False
        )
        generated_text = outputs[0]["generated_text"]

        # Extract the JSON object from the model's output
        json_start = generated_text.find('{')
        json_end = generated_text.rfind('}') + 1
        if json_start != -1 and json_end > json_start:
            cleaned_json_str = generated_text[json_start:json_end]
            return json.loads(cleaned_json_str)
        else:
            return {"error": "Failed to generate valid JSON.", "raw_output": generated_text}

    except Exception as e:
        print(f"Error during analysis: {e}")
        return {"error": f"An error occurred during analysis: {str(e)}"}
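
# Optional local smoke test: a minimal sketch for verifying analyze_document()
# before wiring up the UI. The sample policy text, the sample query, and the
# RUN_SMOKE_TEST environment variable are all illustrative inventions; the guard
# keeps this from running on normal Space startup.
import os

if os.environ.get("RUN_SMOKE_TEST") == "1":
    _sample_doc = (
        "Section 4.2: Water damage caused by burst pipes is covered up to "
        "$5,000 per incident. Flood damage is excluded."
    )
    _sample_query = "A pipe burst caused $3,000 of water damage. Is it covered?"
    # Expected shape (actual contents depend on the model's output):
    # {"decision": "Approved", "amount": 3000, "justification": "..."}
    print(analyze_document(_sample_doc, _sample_query))
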
# 4. Create and launch the Gradio Interface
with gr.Blocks() as demo:
    gr.Markdown("# Policy Analysis API (CPU Version)")
    gr.Markdown("This Gradio app serves the backend for the RAG policy analysis system, optimized for CPU.")
    with gr.Row():
        doc_input = gr.Textbox(lines=5, label="Document Text", placeholder="Paste the document text here...")
        query_input = gr.Textbox(label="Query Text", placeholder="Enter your query here...")
    output_json = gr.JSON(label="Analysis Result")
    analyze_btn = gr.Button("Analyze")

    analyze_btn.click(
        fn=analyze_document,
        inputs=[doc_input, query_input],
        outputs=output_json,
        api_name="analyze"
    )

demo.launch()
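
# ---------------------------------------------------------------------------
# Client-side usage sketch (kept as a comment because demo.launch() above
# blocks; run this from a separate script or notebook). The Space name
# "your-username/policy-analysis-cpu" is a placeholder, not a real Space.
#
#   from gradio_client import Client
#
#   client = Client("your-username/policy-analysis-cpu")
#   result = client.predict(
#       "Section 4.2: Burst-pipe water damage is covered up to $5,000.",  # document_text
#       "A pipe burst caused $3,000 of damage. Is it covered?",           # query_text
#       api_name="/analyze",
#   )
#   print(result)  # the JSON dict returned by analyze_document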