Files changed (1): app.py (+11, -12)
@@ -4,12 +4,12 @@ import base64
  import nest_asyncio
  import shutil
  import logging
- from typing import Any, List, Optional, Dict
- from fastapi import FastAPI, UploadFile, File, Form
  from pydantic import BaseModel
  from huggingface_hub import InferenceClient

- # LlamaIndex Imports
  from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings, PromptTemplate
  from llama_index.core.llms import CustomLLM, CompletionResponse, CompletionResponseGen, LLMMetadata
  from llama_index.core.llms.callbacks import llm_completion_callback
@@ -27,7 +27,7 @@ LLAMA_KEY = os.getenv("LLAMA_KEY")
  nest_asyncio.apply()
  app = FastAPI()

- # --- CONECTOR UNIVERSAL HUGGING FACE (A SOLUÇÃO DO ERRO 404) ---
  class HFCustomLLM(CustomLLM):
  model_name: str = "google/gemma-1.1-7b-it"
  token: str = None
@@ -35,7 +35,6 @@ class HFCustomLLM(CustomLLM):

  def __init__(self, model_name: str, token: str, **kwargs: Any):
  super().__init__(model_name=model_name, token=token, **kwargs)
- # Usa o cliente oficial do Hugging Face que sabe o endereço certo
  self.client = InferenceClient(model=model_name, token=token)

  @property
@@ -44,22 +43,22 @@ class HFCustomLLM(CustomLLM):

  @llm_completion_callback()
  def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
- # Força o uso do endpoint simples de geração de texto
- full_prompt = f"User: {prompt}\nAssistant:"
  try:
  response_text = self.client.text_generation(
- full_prompt,
  max_new_tokens=512,
  temperature=0.1,
  do_sample=True
  )
  return CompletionResponse(text=response_text)
  except Exception as e:
- raise ValueError(f"Erro no Hugging Face: {str(e)}")

  @llm_completion_callback()
  def stream_complete(self, prompt: str, **kwargs: Any) -> CompletionResponseGen:
- # Simplificação: Não usamos stream para evitar erros de rede
  yield self.complete(prompt, **kwargs)

  # --- SISTEMA COGNILINE ---
@@ -82,7 +81,7 @@ class CoreSystem:
  os.environ["LLAMA_CLOUD_API_KEY"] = llama_key

  # 1. Configura o nosso Conector Universal (Google Gemma)
- # Este modelo é muito estável e não costuma dar 404
  Settings.llm = HFCustomLLM(
  model_name="google/gemma-1.1-7b-it",
  token=hf_token
@@ -224,7 +223,7 @@ function App() {
  </div>
  ) : (
  <div className="p-4 bg-green-900/20 border border-green-900 rounded text-[10px] text-green-400">
- SISTEMA OPERACIONAL.<br/>Pronto para análise técnica.
  </div>
  )}
  </aside>
 
  import nest_asyncio
  import shutil
  import logging
+ from typing import Any
+ from fastapi import FastAPI, UploadFile, File
  from pydantic import BaseModel
  from huggingface_hub import InferenceClient

+ # Imports do LlamaIndex
  from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings, PromptTemplate
  from llama_index.core.llms import CustomLLM, CompletionResponse, CompletionResponseGen, LLMMetadata
  from llama_index.core.llms.callbacks import llm_completion_callback
 
  nest_asyncio.apply()
  app = FastAPI()

+ # --- CONECTOR UNIVERSAL HUGGING FACE (Solução para Erro 404) ---
  class HFCustomLLM(CustomLLM):
  model_name: str = "google/gemma-1.1-7b-it"
  token: str = None
 
  def __init__(self, model_name: str, token: str, **kwargs: Any):
  super().__init__(model_name=model_name, token=token, **kwargs)
  self.client = InferenceClient(model=model_name, token=token)

  @property
 
  @llm_completion_callback()
  def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
+ # Formata o prompt para o modelo Gemma entender que é um chat
+ formatted_prompt = f"<start_of_turn>user\n{prompt}<end_of_turn>\n<start_of_turn>model"
  try:
  response_text = self.client.text_generation(
+ formatted_prompt,
  max_new_tokens=512,
  temperature=0.1,
  do_sample=True
  )
  return CompletionResponse(text=response_text)
  except Exception as e:
+ raise ValueError(f"Erro na API Hugging Face: {str(e)}")

  @llm_completion_callback()
  def stream_complete(self, prompt: str, **kwargs: Any) -> CompletionResponseGen:
+ # Sem stream para garantir estabilidade
  yield self.complete(prompt, **kwargs)

  # --- SISTEMA COGNILINE ---
 
  os.environ["LLAMA_CLOUD_API_KEY"] = llama_key

  # 1. Configura o nosso Conector Universal (Google Gemma)
+ # Este modelo é muito estável na versão gratuita
  Settings.llm = HFCustomLLM(
  model_name="google/gemma-1.1-7b-it",
  token=hf_token
 
  </div>
  ) : (
  <div className="p-4 bg-green-900/20 border border-green-900 rounded text-[10px] text-green-400">
+ NÚCLEO OPERACIONAL.<br/>Pronto para análise.
  </div>
  )}
  </aside>