Spaces:
Running
Running
| # from langchain_community.embeddings import HuggingFaceEmbeddings | |
| # from langchain_community.embeddings import CrossEncoder | |
| import requests | |
| import numpy as np | |
| import time | |
| import json | |
| # encode the text | |
| from utils.encoding_input import encode_text | |
| # rertrieve and rerank the documents | |
| from utils.retrieve_n_rerank import retrieve_and_rerank | |
| # sentiment analysis on reranked documents | |
| from utils.sentiment_analysis import get_sentiment | |
| # coherence assessment reports | |
| from utils.coherence_bbscore import coherence_report | |
| # Get the vectorstore | |
| from utils.loading_embeddings import get_vectorstore | |
| # build message from model generation | |
| from utils.model_generation import build_messages | |
| import os | |
| import spaces | |
# Credentials and model selection for the DigitalOcean inference API.
# SECURITY: the key was previously hard-coded in source (a leaked secret).
# It must come from the environment instead — and the old key should be
# rotated, since it has already been committed.
API_KEY = os.environ.get("DO_API_KEY", "")
MODEL = "llama3.3-70b-instruct"
def generate_response_stream(query: str, enable_sentiment: bool, enable_coherence: bool):
    """Stream an LLM answer grounded in retrieved-and-reranked documents.

    Pipeline: retrieve candidates from the vectorstore, rerank them with a
    cross-encoder, optionally attach sentiment / coherence analyses, build
    the chat prompt, and stream the completion from the DigitalOcean
    inference API as server-sent events.

    Args:
        query: The user's question.
        enable_sentiment: When True, include a sentiment rollup of the
            reranked documents in the prompt.
        enable_coherence: When True, include a coherence report in the prompt.

    Yields:
        str: The accumulated response text so far (each yield is the full
        text, progressively longer), or a single error/fallback message.
    """
    # Initialize the vectorstore lazily, only when a query actually arrives.
    vectorstore = get_vectorstore()

    reranked_results = retrieve_and_rerank(
        query_text=query,
        vectorstore=vectorstore,
        k=50,  # number of initial documents to retrieve
        rerank_model="cross-encoder/ms-marco-MiniLM-L-6-v2",
        top_m=20,  # number of documents to keep after reranking
        min_score=0.5,  # minimum cross-encoder score for a document to survive
        only_docs=False,  # return (document, score) pairs, not bare documents
    )
    top_docs = [doc for doc, _score in reranked_results]
    if not top_docs:
        yield "No relevant documents found."
        return

    # Optional analyses; skipped entirely when the feature flags are off.
    sentiment_rollup = get_sentiment(top_docs) if enable_sentiment else {}
    coherence_report_ = (
        coherence_report(reranked_results=top_docs, input_text=query)
        if enable_coherence
        else ""
    )

    messages = build_messages(
        query=query,
        top_docs=top_docs,
        task_mode="verbatim_sentiment",
        sentiment_rollup=sentiment_rollup,
        coherence_report=coherence_report_,
    )

    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json",
    }
    data = {
        "model": MODEL,
        "messages": messages,
        "temperature": 0.2,
        "stream": True,
        "max_tokens": 2000,
    }

    collected = ""  # Accumulate content to show
    # timeout=(connect, read): without it, a stalled server would hang this
    # generator (and the UI behind it) forever.
    with requests.post(
        "https://inference.do-ai.run/v1/chat/completions",
        headers=headers,
        json=data,
        stream=True,
        timeout=(10, 300),
    ) as r:
        if r.status_code != 200:
            yield f"[ERROR] API returned status {r.status_code}: {r.text}"
            return
        for line in r.iter_lines(decode_unicode=True):
            # Skip SSE keep-alive blanks and the end-of-stream sentinel.
            if not line or line.strip() == "data: [DONE]":
                continue
            if line.startswith("data: "):
                line = line[len("data: "):]
            try:
                chunk = json.loads(line)
                delta = chunk.get("choices", [{}])[0].get("delta", {}).get("content", "")
            except (json.JSONDecodeError, IndexError, AttributeError) as e:
                # Malformed chunk: log and keep streaming rather than abort.
                print("Streaming decode error:", e)
                continue
            if delta:
                collected += delta
                yield collected  # yield progressively
                time.sleep(0.01)  # slight throttle to improve smoothness