# policy-analysis/utils/generation_streaming.py
# (Hugging Face Space artifact — author: kaburia, hardware: ZeroGPU, rev 0ed976e)
"""Streaming response generation for the policy-analysis Space.

Retrieves and reranks documents from the vectorstore, optionally runs
sentiment and coherence analysis, then streams an LLM completion back
to the UI chunk by chunk.
"""
import json
import os
import time

import requests
import spaces

# coherence assessment reports
from utils.coherence_bbscore import coherence_report
# encode the text (not used in the active code path — kept for compatibility)
from utils.encoding_input import encode_text
# get the vectorstore
from utils.loading_embeddings import get_vectorstore
# build messages for model generation
from utils.model_generation import build_messages
# retrieve and rerank the documents
from utils.retrieve_n_rerank import retrieve_and_rerank
# sentiment analysis on reranked documents
from utils.sentiment_analysis import get_sentiment

# SECURITY: never commit API keys to source. The previously hard-coded key
# was exposed in the repository history and must be rotated; the key is now
# read from the environment (set DO_API_KEY in the Space secrets).
API_KEY = os.getenv("DO_API_KEY", "")
MODEL = "llama3.3-70b-instruct"
@spaces.GPU(duration=120)
def generate_response_stream(query: str, enable_sentiment: bool, enable_coherence: bool):
    """Stream an LLM answer for *query*, grounded in reranked documents.

    Yields progressively longer strings (the accumulated response so far),
    suitable for a streaming chat UI. Yields a single message instead when
    retrieval finds nothing or the completion API call fails.

    Args:
        query: The user's question.
        enable_sentiment: If True, run sentiment analysis on the top docs
            and feed the rollup into the prompt.
        enable_coherence: If True, compute a coherence report on the top
            docs and feed it into the prompt.
    """
    # Initialize vectorstore lazily so module import stays cheap.
    vectorstore = get_vectorstore()

    reranked_results = retrieve_and_rerank(
        query_text=query,
        vectorstore=vectorstore,
        k=50,  # number of initial documents to retrieve
        rerank_model="cross-encoder/ms-marco-MiniLM-L-6-v2",
        top_m=20,  # number of documents to return after reranking
        min_score=0.5,  # minimum score for reranked documents
        only_docs=False,  # return (document, score) pairs
    )
    top_docs = [doc for doc, _score in reranked_results]
    if not top_docs:
        yield "No relevant documents found."
        return

    # Optional analyses feed extra context into the prompt.
    sentiment_rollup = get_sentiment(top_docs) if enable_sentiment else {}
    coherence_report_ = (
        coherence_report(reranked_results=top_docs, input_text=query)
        if enable_coherence
        else ""
    )

    messages = build_messages(
        query=query,
        top_docs=top_docs,
        task_mode="verbatim_sentiment",
        sentiment_rollup=sentiment_rollup,
        coherence_report=coherence_report_,
    )

    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json",
    }
    data = {
        "model": MODEL,
        "messages": messages,
        "temperature": 0.2,
        "stream": True,
        "max_tokens": 2000,
    }

    collected = ""  # accumulated content shown to the user
    try:
        with requests.post(
            "https://inference.do-ai.run/v1/chat/completions",
            headers=headers,
            json=data,
            stream=True,
            # (connect, read) timeouts — without these a stalled connection
            # would hang the generator (and the UI) indefinitely.
            timeout=(10, 120),
        ) as r:
            if r.status_code != 200:
                yield f"[ERROR] API returned status {r.status_code}: {r.text}"
                return
            for line in r.iter_lines(decode_unicode=True):
                # Skip SSE keep-alive blanks and the terminal sentinel.
                if not line or line.strip() == "data: [DONE]":
                    continue
                if line.startswith("data: "):
                    line = line[len("data: "):]
                try:
                    chunk = json.loads(line)
                    delta = chunk.get("choices", [{}])[0].get("delta", {}).get("content", "")
                except (json.JSONDecodeError, AttributeError, IndexError, TypeError) as e:
                    # Malformed or partial SSE payload — log and keep streaming.
                    print("Streaming decode error:", e)
                    continue
                if delta:
                    collected += delta
                    yield collected  # yield progressively growing text
                    time.sleep(0.01)  # slight throttle to improve smoothness
    except requests.RequestException as e:
        # Network-level failure (timeout, DNS, connection reset) — surface it
        # to the UI the same way HTTP-status errors are surfaced.
        yield f"[ERROR] Request failed: {e}"