# app.py
# FINAL CPU VERSION using a quantized model for maximum reliability on free hardware.

# 1. Import necessary libraries
import gradio as gr
import json
# **FIXED:** Import AutoModelForCausalLM from the main transformers library
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

# 2. Load the Quantized Language Model
# This model is optimized to use less memory, making it stable on free CPUs.
try:
    model_name_or_path = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"

    # Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

    # Load the quantized model using the standard transformers class.
    # The installed 'optimum' and 'auto-gptq' libraries will handle the GPTQ format automatically.
    model = AutoModelForCausalLM.from_pretrained(
        model_name_or_path,
        use_safetensors=True,
        trust_remote_code=False,
        device_map="auto"  # Falls back to CPU automatically when no GPU is present
    )

    # Create the text-generation pipeline
    generator = pipeline(
        task="text-generation",
        model=model,
        tokenizer=tokenizer
    )
    print("Quantized model loaded successfully on CPU.")
    MODEL_LOADED = True
except Exception as e:
    print(f"Error loading quantized model: {e}")
    generator = None
    MODEL_LOADED = False
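
# A minimal sketch of the requirements.txt this load path assumes, based on the
# imports and the 'optimum'/'auto-gptq' comment above (names only; versions are
# deliberately left unpinned here):
#
#   gradio
#   torch
#   transformers
#   optimum
#   auto-gptq
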
# 3. Define the core analysis function
def analyze_document(document_text, query_text):
    """
    Analyzes the document based on the query using the loaded LLM.
    """
    if not MODEL_LOADED or generator is None:
        return {"error": "Model is not available. Please check the Space logs for errors."}

    # The chat-based prompt format for TinyLlama
    messages = [
        {
            "role": "system",
            "content": """You are an expert AI assistant for a claims processing department. Your task is to analyze an insurance policy document and a user's query to make a decision. Based ONLY on the information in the Policy Document, determine if the request should be approved or rejected. Provide your final answer in a strict JSON format. The JSON object must contain three keys: "decision" (string, "Approved" or "Rejected"), "amount" (number, 0 if not applicable), and "justification" (string, explaining your reasoning and citing the policy). Do not use any information outside of the provided Policy Document."""
        },
        {
            "role": "user",
            "content": f"""
**Policy Document (Source of Truth):**
---
{document_text}
---
**User Query:**
---
{query_text}
---
**JSON Response:**
"""
        }
    ]
    prompt = generator.tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
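    # For reference, TinyLlama-Chat's template renders the messages roughly as:
    #   <|system|>\n{system}</s>\n<|user|>\n{user}</s>\n<|assistant|>\n
    # (illustrative sketch; the authoritative template ships with the tokenizer)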

    try:
        # Generate the response from the LLM
        outputs = generator(
            prompt,
            max_new_tokens=256,
            do_sample=True,
            temperature=0.7,
            top_k=50,
            top_p=0.95,
            return_full_text=False  # Return only the completion, not the echoed prompt
        )
        generated_text = outputs[0]["generated_text"]

        # Extract the JSON object from the model's output
        json_start = generated_text.find('{')
        json_end = generated_text.rfind('}') + 1
        if json_start != -1 and json_end > json_start:
            cleaned_json_str = generated_text[json_start:json_end]
            return json.loads(cleaned_json_str)
        else:
            return {"error": "Failed to generate valid JSON.", "raw_output": generated_text}
    except Exception as e:
        print(f"Error during analysis: {e}")
        return {"error": f"An error occurred during analysis: {str(e)}"}
# 4. Create and launch the Gradio Interface
with gr.Blocks() as demo:
    gr.Markdown("# Policy Analysis API (CPU Version)")
    gr.Markdown("This Gradio app serves as the backend for the RAG policy analysis system, optimized for CPU.")
    with gr.Row():
        doc_input = gr.Textbox(lines=5, label="Document Text", placeholder="Paste the document text here...")
        query_input = gr.Textbox(label="Query Text", placeholder="Enter your query here...")
    output_json = gr.JSON(label="Analysis Result")
    analyze_btn = gr.Button("Analyze")
    analyze_btn.click(
        fn=analyze_document,
        inputs=[doc_input, query_input],
        outputs=output_json,
        api_name="analyze"
    )

demo.launch()
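
# Calling the endpoint programmatically (a minimal sketch; the Space ID below is
# a placeholder, substitute your own username/space-name):
#
#   from gradio_client import Client
#
#   client = Client("your-username/your-space-name")
#   result = client.predict(
#       "policy text here...",   # document_text
#       "query here...",         # query_text
#       api_name="/analyze"
#   )
#   print(result)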