import json
import os
import time

import numpy as np  # noqa: F401  (kept: file-level import, may be used elsewhere)
import requests
import spaces

from utils.coherence_bbscore import coherence_report
from utils.encoding_input import encode_text  # noqa: F401  (kept: file-level import)
from utils.loading_embeddings import get_vectorstore
from utils.model_generation import build_messages
from utils.retrieve_n_rerank import retrieve_and_rerank
from utils.sentiment_analysis import get_sentiment

# SECURITY: a previous revision hard-coded a DigitalOcean inference API key
# here. That key must be treated as compromised and rotated. Secrets are now
# read from the environment so they never live in source control.
API_KEY = os.environ.get("DO_API_KEY", "")
MODEL = "llama3.3-70b-instruct"
API_URL = "https://inference.do-ai.run/v1/chat/completions"


def _extract_delta(raw_line: str):
    """Parse one SSE payload line; return its content delta ('' if none, None on error)."""
    try:
        chunk = json.loads(raw_line)
        return chunk.get("choices", [{}])[0].get("delta", {}).get("content", "")
    except (json.JSONDecodeError, IndexError, KeyError, TypeError) as e:
        # Malformed or unexpectedly-shaped chunk: log and let the caller skip it.
        print("Streaming decode error:", e)
        return None


@spaces.GPU(duration=120)
def generate_response_stream(query: str, enable_sentiment: bool, enable_coherence: bool):
    """Stream a retrieval-grounded LLM answer for *query*.

    Pipeline:
      1. Retrieve 50 candidates from the vectorstore and rerank them with a
         cross-encoder, keeping the top 20 above a 0.5 score threshold.
      2. Optionally compute a sentiment rollup and/or a coherence report
         over the top documents.
      3. Build chat messages and stream the completion from the inference
         API, yielding the accumulated response text after each delta.

    Args:
        query: The user's question.
        enable_sentiment: If True, attach a sentiment rollup of the top docs.
        enable_coherence: If True, attach a coherence report of the top docs.

    Yields:
        str: The response text accumulated so far (progressive updates), or a
        single notice/error message when retrieval or the API call fails.
    """
    # Initialize vectorstore lazily, only when a request arrives.
    vectorstore = get_vectorstore()

    reranked_results = retrieve_and_rerank(
        query_text=query,
        vectorstore=vectorstore,
        k=50,  # number of initial documents to retrieve
        rerank_model="cross-encoder/ms-marco-MiniLM-L-6-v2",
        top_m=20,  # number of documents to return after reranking
        min_score=0.5,  # minimum score for reranked documents
        only_docs=False,  # return both documents and scores
    )
    top_docs = [doc for doc, _score in reranked_results]
    if not top_docs:
        yield "No relevant documents found."
        return

    sentiment_rollup = get_sentiment(top_docs) if enable_sentiment else {}
    coherence_report_ = (
        coherence_report(reranked_results=top_docs, input_text=query)
        if enable_coherence
        else ""
    )

    messages = build_messages(
        query=query,
        top_docs=top_docs,
        task_mode="verbatim_sentiment",
        sentiment_rollup=sentiment_rollup,
        coherence_report=coherence_report_,
    )

    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json",
    }
    data = {
        "model": MODEL,
        "messages": messages,
        "temperature": 0.2,
        "stream": True,
        "max_tokens": 2000,
    }

    collected = ""  # accumulated content shown to the caller
    # timeout=(connect, read) keeps the generator from hanging forever if the
    # inference endpoint stalls mid-stream.
    with requests.post(
        API_URL, headers=headers, json=data, stream=True, timeout=(10, 120)
    ) as r:
        if r.status_code != 200:
            yield f"[ERROR] API returned status {r.status_code}: {r.text}"
            return
        for line in r.iter_lines(decode_unicode=True):
            # Skip keep-alive blanks and the terminal sentinel.
            if not line or line.strip() == "data: [DONE]":
                continue
            if line.startswith("data: "):
                line = line[len("data: "):]
            delta = _extract_delta(line)
            if delta:
                collected += delta
                yield collected  # yield progressively
                time.sleep(0.01)  # slight throttle to improve smoothness