Spaces:
Sleeping
Sleeping
dolphinium
add concurrent processing for visualization generation and update report streaming
621afd7
| import gradio as gr | |
| import json | |
| import re | |
| import datetime | |
| import pandas as pd | |
| import pysolr | |
| import google.generativeai as genai | |
| from sshtunnel import SSHTunnelForwarder | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| import io | |
| import os | |
| import logging | |
| import concurrent.futures | |
| from IPython.display import display, Markdown | |
# --- Suppress Matplotlib Debug Logs ---
logging.getLogger('matplotlib').setLevel(logging.WARNING)
# --- SSH Tunnel Configuration ---
# It's recommended to load secrets securely, e.g., from environment variables
SSH_HOST = os.environ.get('SSH_HOST')
SSH_PORT = 5322  # custom SSH port of the jump host (not the default 22)
SSH_USER = os.environ.get('SSH_USER')
SSH_PASS = os.environ.get('SSH_PASS')
# --- Solr Configuration ---
REMOTE_SOLR_HOST = '69.167.186.48'  # Solr host as reachable from the SSH server
REMOTE_SOLR_PORT = 8983
LOCAL_BIND_PORT = 8983  # local end of the tunnel; must be free on this machine
SOLR_CORE_NAME = 'news'
SOLR_USER = os.environ.get('SOLR_USER')
SOLR_PASS = os.environ.get('SOLR_PASS')
# --- Google Gemini Configuration ---
try:
    genai.configure(api_key=os.environ.get('GEMINI_API_KEY'))
except Exception as e:
    print(f"β Gemini API Key Error: {e}. Please ensure 'GEMINI_API_KEY' is set in your environment.")
# --- Global Variables ---
# Populated by the setup block below. `is_initialized` gates the Gradio
# launch at the bottom of the file; the other three are used by the
# analysis functions at request time.
ssh_tunnel_server = None
solr_client = None
llm_model = None
is_initialized = False
try:
    # 1. Start the SSH Tunnel
    ssh_tunnel_server = SSHTunnelForwarder(
        (SSH_HOST, SSH_PORT),
        ssh_username=SSH_USER,
        ssh_password=SSH_PASS,
        remote_bind_address=(REMOTE_SOLR_HOST, REMOTE_SOLR_PORT),
        local_bind_address=('127.0.0.1', LOCAL_BIND_PORT)
    )
    ssh_tunnel_server.start()
    print(f"π SSH tunnel established: Local Port {ssh_tunnel_server.local_bind_port} -> Remote Solr.")
    # 2. Initialize the pysolr client (talks to Solr through the local tunnel end)
    solr_url = f'http://127.0.0.1:{ssh_tunnel_server.local_bind_port}/solr/{SOLR_CORE_NAME}'
    solr_client = pysolr.Solr(solr_url, auth=(SOLR_USER, SOLR_PASS), always_commit=True)
    solr_client.ping()  # raises if the core is unreachable or auth fails
    print(f"β Solr connection successful on core '{SOLR_CORE_NAME}'.")
    # 3. Initialize the LLM
    # temperature=0 keeps query generation as deterministic as possible
    llm_model = genai.GenerativeModel('gemini-1.5-flash', generation_config=genai.types.GenerationConfig(temperature=0))
    print(f"β LLM Model '{llm_model.model_name}' initialized.")
    print("β System Initialized Successfully.")
    is_initialized = True
except Exception as e:
    print(f"\nβ An error occurred during setup: {e}")
    # Best-effort cleanup: don't leave a half-open tunnel running
    if ssh_tunnel_server and ssh_tunnel_server.is_active:
        ssh_tunnel_server.stop()
# Solr schema metadata fed verbatim into the LLM prompts below.
# Each entry documents one field: its name, type/usage class, example values,
# and a definition telling the model when to use it. The convention visible in
# the definitions: `*_s` multi-valued fields are for `query` searches, while
# the canonical single-valued fields are for `terms` faceting.
field_metadata = [
    {
        "field_name": "business_model",
        "type": "string (categorical)",
        "example_values": ["pharma/bio", "drug delivery", "pharma services"],
        "definition": "The primary business category of the company involved in the news. Use for filtering by high-level industry segments."
    },
    {
        "field_name": "news_type",
        "type": "string (categorical)",
        "example_values": ["product news", "financial news", "regulatory news"],
        "definition": "The category of the news article itself (e.g., financial, regulatory, acquisition). Use for filtering by the type of event being reported."
    },
    {
        "field_name": "event_type",
        "type": "string (categorical)",
        "example_values": ["phase 2", "phase 1", "pre clinical", "marketed"],
        "definition": "The clinical or developmental stage of a product or event discussed in the article. Essential for queries about clinical trial phases."
    },
    {
        "field_name": "source",
        "type": "string (categorical)",
        "example_values": ["Press Release", "PR Newswire", "Business Wire"],
        "definition": "The original source of the news article, such as a newswire or official report."
    },
    {
        "field_name": "company_name",
        "type": "string (exact match, for faceting)",
        "example_values": ["pfizer inc.", "astrazeneca plc", "roche"],
        "definition": "The canonical, standardized name of a company. **Crucially, you MUST use this field for `terms` faceting** to group results by a unique company. Do NOT use this for searching."
    },
    {
        "field_name": "company_name_s",
        "type": "string (multi-valued, for searching)",
        "example_values": ["pfizer inc.", "roche", "f. hoffmann-la roche ag", "nih"],
        "definition": "A field containing all known names and synonyms for a company. **You MUST use this field for all `query` parameter searches involving a company name** to ensure comprehensive results. Do NOT use for `terms` faceting."
    },
    {
        "field_name": "territory_hq_s",
        "type": "string (multi-valued, hierarchical)",
        "example_values": ["united states of america", "europe", "europe western"],
        "definition": "The geographic location (country and continent) of a company's headquarters. It is hierarchical. Use for filtering by location."
    },
    {
        "field_name": "therapeutic_category",
        "type": "string (specific)",
        "example_values": ["cancer, other", "cancer, nsclc metastatic", "alzheimer's"],
        "definition": "The specific disease or therapeutic area being targeted. Use for very specific disease queries."
    },
    {
        "field_name": "therapeutic_category_s",
        "type": "string (multi-valued, for searching)",
        "example_values": ["cancer", "oncology", "infections", "cns"],
        "definition": "Broader, multi-valued therapeutic categories and their synonyms. **Use this field for broad category searches** in the `query` parameter."
    },
    {
        "field_name": "compound_name",
        "type": "string (exact match, for faceting)",
        "example_values": ["opdivo injection solution", "keytruda injection solution"],
        "definition": "The specific, full trade name of a drug. **Use this field for `terms` faceting** on compounds."
    },
    {
        "field_name": "compound_name_s",
        "type": "string (multi-valued, for searching)",
        "example_values": ["nivolumab injection solution", "opdivo injection solution", "ono-4538 injection solution"],
        "definition": "A field with all known trade names and synonyms for a drug. **Use this field for all `query` parameter searches** involving a compound name."
    },
    {
        "field_name": "molecule_name",
        "type": "string (exact match, for faceting)",
        "example_values": ["cannabidiol", "paclitaxel", "pembrolizumab"],
        "definition": "The generic, non-proprietary name of the active molecule. **Use this field for `terms` faceting** on molecules."
    },
    {
        "field_name": "molecule_name_s",
        "type": "string (multi-valued, for searching)",
        "example_values": ["cbd", "s1-220", "a1002n5s"],
        "definition": "A field with all known generic names and synonyms for a molecule. **Use this field for all `query` parameter searches** involving a molecule name."
    },
    {
        "field_name": "highest_phase",
        "type": "string (categorical)",
        "example_values": ["marketed", "phase 2", "phase 1"],
        "definition": "The highest stage of development a drug has ever reached."
    },
    {
        "field_name": "drug_delivery_branch_s",
        "type": "string (multi-valued, for searching)",
        "example_values": ["injection", "parenteral", "oral", "injection, other", "oral, other"],
        "definition": "The method of drug administration. **Use this for `query` parameter searches about route of administration** as it contains broader, search-friendly terms."
    },
    {
        "field_name": "drug_delivery_branch",
        "type": "string (categorical, specific, for faceting)",
        "example_values": ["injection, other", "prefilled syringes", "np liposome", "oral enteric/delayed release"],
        "definition": "The most specific category of drug delivery technology. **Use this field for `terms` faceting** on specific delivery technologies."
    },
    {
        "field_name": "route_branch",
        "type": "string (categorical)",
        "example_values": ["injection", "oral", "topical", "inhalation"],
        "definition": "The primary route of drug administration. Good for faceting on exact routes."
    },
    {
        "field_name": "molecule_api_group",
        "type": "string (categorical)",
        "example_values": ["small molecules", "biologics", "nucleic acids"],
        "definition": "High-level classification of the drug's molecular type."
    },
    {
        "field_name": "content",
        "type": "text (full-text search)",
        "example_values": ["The largest study to date...", "balstilimab..."],
        "definition": "The full text content of the news article. Use for keyword searches on topics not covered by other specific fields."
    },
    {
        "field_name": "date",
        "type": "date",
        "example_values": ["2020-10-22T00:00:00Z"],
        "definition": "The full publication date and time in ISO 8601 format. Use for precise date range queries."
    },
    {
        "field_name": "date_year",
        "type": "number (year)",
        "example_values": [2020, 2021, 2022],
        "definition": "The 4-digit year of publication. **Use this for queries involving whole years** (e.g., 'in 2023', 'last year', 'since 2020')."
    },
    {
        "field_name": "total_deal_value_in_million",
        "type": "number (metric)",
        "example_values": [50, 120.5, 176.157, 1000],
        "definition": "The total value of a financial deal, in millions of USD. This is the primary numeric field for financial aggregations (sum, avg, etc.). To use this, you must also filter for news that has a deal value, e.g., 'total_deal_value_in_million:[0 TO *]'."
    }
]
# Helper function to format the metadata for the prompt
def format_metadata_for_prompt(metadata):
    """Render the field metadata list as a Markdown bullet list for LLM prompts.

    Args:
        metadata: list of dicts, each with 'field_name', 'type', 'definition'
            and 'example_values' keys (see ``field_metadata``).

    Returns:
        str: one Markdown bullet block per field, each followed by a blank
        line. Returns "" for an empty list.
    """
    # Collect the pieces and join once instead of repeated `+=` on a string
    # (avoids quadratic re-allocation; output is byte-identical).
    parts = []
    for field in metadata:
        parts.append(
            f"- **{field['field_name']}**\n"
            f"  - **Type**: {field['type']}\n"
            f"  - **Definition**: {field['definition']}\n"
            f"  - **Examples**: {', '.join(map(str, field['example_values']))}\n\n"
        )
    return "".join(parts)
| formatted_field_info = format_metadata_for_prompt(field_metadata) | |
def parse_suggestions_from_report(report_text):
    """Pull the numbered follow-up suggestions out of a Markdown report.

    Finds the suggestions heading (either of the two known titles, case
    insensitive), then collects every numbered list item in the section
    that follows it.

    Args:
        report_text: full Markdown report produced by the summary LLM.

    Returns:
        list[str]: stripped suggestion strings; empty when no suggestions
        section is present.
    """
    heading_pattern = r"### (?:Deeper Dive: Suggested Follow-up Analyses|Suggestions for Further Exploration)\s*\n(.*?)$"
    section = re.search(heading_pattern, report_text, re.DOTALL | re.IGNORECASE)
    if section is None:
        return []
    numbered_items = re.findall(r"^\s*\d+\.\s*(.*)", section.group(1), re.MULTILINE)
    return [item.strip() for item in numbered_items]
def llm_generate_solr_query_with_history(natural_language_query, field_metadata, chat_history):
    """Generates a Solr query and facet JSON from a natural language query, considering the conversation history.

    Args:
        natural_language_query: the user's current question.
        field_metadata: Solr field metadata list. NOTE(review): this argument
            is not referenced below — the prompt embeds the module-level
            `formatted_field_info` instead; kept for interface stability.
        chat_history: list of (user_message, bot_message) pairs; only the
            user side is forwarded to the model.

    Returns:
        dict parsed from the model's JSON output (expected keys: 'query'
        and 'json.facet'), or None on any generation/parsing failure.
    """
    # Format the chat history for the prompt
    formatted_history = ""
    for user_msg, bot_msg in chat_history:
        # We only need the user's queries for context, not the bot's detailed responses.
        if user_msg:
            # CORRECTED: Properly formatted f-string with a newline character
            formatted_history += f"- User: \"{user_msg}\"\n"
    prompt = f"""
You are an expert Solr query engineer who converts natural language questions into precise Solr JSON Facet API query objects. Your primary goal is to create a valid JSON object with `query` and `json.facet` keys.
---
### CONVERSATIONAL CONTEXT & RULES
1. **Today's Date for Calculations**: 2025-07-16
2. **Allowed Facet Types**: The `type` key for any facet MUST be one of the following: `terms`, `query`, or `range`. **Do not use `date_histogram`**. For time-series analysis, use a `range` facet on a date field.
3. **Field Usage**: You MUST use the fields described in the 'Field Definitions' section. Pay close attention to the definitions to select the correct field.
4. **Facet vs. Query Field Distinction**: This is critical.
* For searching in the main `query` parameter, ALWAYS use the multi-valued search fields (ending in `_s`, like `company_name_s`) to get comprehensive results.
* For grouping in a `terms` facet, ALWAYS use the canonical, single-value field (e.g., `company_name`, `molecule_name`) to ensure unique and accurate grouping.
5. **No `count(*)`**: Do NOT use functions like `count(*)`. The default facet bucket count is sufficient for counting documents.
6. **Allowed Aggregations**: For statistical facets, only use these functions: `sum`, `avg`, `min`, `max`, `unique`. The primary metric field is `total_deal_value_in_million`. The aggregation MUST be a simple string like `"sum(total_deal_value_in_million)"` and not a nested JSON object.
7. **Term Facet Limits**: Every `terms` facet MUST include a `limit` key. Default to `limit: 10` unless the user specifies a different number of top results.
8. **Output Format**: Your final output must be a single, raw JSON object and nothing else. Do not add comments, explanations, or markdown formatting like ```json.
---
### FIELD DEFINITIONS (Your Source of Truth)
`{formatted_field_info}`
---
### CHAT HISTORY
`{formatted_history}`
---
### EXAMPLE OF A FOLLOW-UP QUERY
**Initial User Query:** "What are the infections news in this year?"
```json
{{
"query": "date_year:2025 AND therapeutic_category_s:infections",
"json.facet": {{
"infections_news_by_type": {{
"type": "terms",
"field": "news_type",
"limit": 10
}}
}}
}}
```
**Follow-up User Query:** "Compare deal values for injection vs oral."
**Correct JSON Output for the Follow-up:**
```json
{{
"query": "therapeutic_category_s:infections AND date_year:2025 AND total_deal_value_in_million:[0 TO *]",
"json.facet": {{
"injection_deals": {{
"type": "query",
"q": "route_branch:injection",
"facet": {{
"total_deal_value": "sum(total_deal_value_in_million)"
}}
}},
"oral_deals": {{
"type": "query",
"q": "route_branch:oral",
"facet": {{
"total_deal_value": "sum(total_deal_value_in_million)"
}}
}}
}}
}}
```
---
### YOUR TASK
Now, convert the following user query into a single, raw JSON object with 'query' and 'json.facet' keys, strictly following all rules and field definitions provided above and considering the chat history.
**Current User Query:** `{natural_language_query}`
"""
    try:
        response = llm_model.generate_content(prompt)
        # Using a more robust regex to clean the response
        cleaned_text = re.sub(r'```json\s*|\s*```', '', response.text, flags=re.MULTILINE | re.DOTALL).strip()
        return json.loads(cleaned_text)
    except Exception as e:
        # `response` may be unbound if generate_content() itself raised;
        # guard the lookup so the handler cannot raise a NameError.
        raw_response_text = response.text if 'response' in locals() else 'N/A'
        print(f"Error in llm_generate_solr_query_with_history: {e}\nRaw Response:\n{raw_response_text}")
        return None
def llm_generate_visualization_code(query_context, facet_data):
    """Generates Python code for visualization based on query and data.

    Args:
        query_context: the user's natural-language analytical goal.
        facet_data: aggregated Solr facet results the chart must visualize.

    Returns:
        str: raw Python source (expected to define a Matplotlib `fig`), or
        None if the LLM call or cleanup fails.
    """
    prompt = f"""
You are a Python Data Visualization expert specializing in Matplotlib and Seaborn.
Your task is to generate robust, error-free Python code to create a single, insightful visualization based on the user's query and the provided Solr facet data.
**User's Analytical Goal:**
"{query_context}"
**Aggregated Data (from Solr Facets):**
```json
{json.dumps(facet_data, indent=2)}
```
---
### **CRITICAL INSTRUCTIONS: CODE GENERATION RULES**
You MUST follow these rules to avoid errors.
**1. Identify the Data Structure FIRST:**
Before writing any code, analyze the `facet_data` JSON to determine its structure. There are three common patterns. Choose the correct template below.
* **Pattern A: Simple `terms` Facet.** The JSON has ONE main key (besides "count") which contains a list of "buckets". Each bucket has a "val" and a "count". Use this for standard bar charts.
* **Pattern B: Multiple `query` Facets.** The JSON has MULTIPLE keys (besides "count"), and each key is an object containing metrics like "count" or "sum(...)". Use this for comparing a few distinct items (e.g., "oral vs injection").
* **Pattern C: Nested `terms` Facet.** The JSON has one main key with a list of "buckets", but inside EACH bucket, there are nested metric objects. This is used for grouped comparisons (e.g., "compare 2024 vs 2025 across categories"). This almost always requires `pandas`.
**2. Use the Correct Parsing Template:**
---
**TEMPLATE FOR PATTERN A (Simple Bar Chart from `terms` facet):**
```python
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
plt.style.use('seaborn-v0_8-whitegrid')
fig, ax = plt.subplots(figsize=(12, 8))
# Dynamically find the main facet key (the one with 'buckets')
facet_key = None
for key, value in facet_data.items():
    if isinstance(value, dict) and 'buckets' in value:
        facet_key = key
        break
if facet_key:
    buckets = facet_data[facet_key].get('buckets', [])
    # Check if buckets contain data
    if buckets:
        df = pd.DataFrame(buckets)
        # Check for a nested metric or use 'count'
        if 'total_deal_value' in df.columns and pd.api.types.is_dict_like(df['total_deal_value'].iloc):
            # Example for nested sum metric
            df['value'] = df['total_deal_value'].apply(lambda x: x.get('sum', 0))
            y_axis_label = 'Sum of Total Deal Value'
        else:
            df.rename(columns={{'count': 'value'}}, inplace=True)
            y_axis_label = 'Count'
        sns.barplot(data=df, x='val', y='value', ax=ax, palette='viridis')
        ax.set_xlabel('Category')
        ax.set_ylabel(y_axis_label)
    else:
        ax.text(0.5, 0.5, 'No data in buckets to plot.', ha='center')
ax.set_title('Your Insightful Title Here')
# Correct way to rotate labels to prevent errors
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
plt.tight_layout()
```
---
**TEMPLATE FOR PATTERN B (Comparison Bar Chart from `query` facets):**
```python
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
plt.style.use('seaborn-v0_8-whitegrid')
fig, ax = plt.subplots(figsize=(10, 6))
labels = []
values = []
# Iterate through top-level keys, skipping the 'count'
for key, data_dict in facet_data.items():
    if key == 'count' or not isinstance(data_dict, dict):
        continue
    # Extract the label (e.g., 'oral_deals' -> 'Oral')
    label = key.replace('_deals', '').replace('_', ' ').title()
    # Find the metric value, which is NOT 'count'
    metric_value = 0
    for sub_key, sub_value in data_dict.items():
        if sub_key != 'count':
            metric_value = sub_value
            break # Found the metric
    labels.append(label)
    values.append(metric_value)
if labels:
    sns.barplot(x=labels, y=values, ax=ax, palette='mako')
    ax.set_ylabel('Total Deal Value') # Or other metric name
    ax.set_xlabel('Category')
else:
    ax.text(0.5, 0.5, 'No query facet data to plot.', ha='center')
ax.set_title('Your Insightful Title Here')
plt.tight_layout()
```
---
**TEMPLATE FOR PATTERN C (Grouped Bar Chart from nested `terms` facet):**
```python
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
plt.style.use('seaborn-v0_8-whitegrid')
fig, ax = plt.subplots(figsize=(14, 8))
# Find the key that has the buckets
facet_key = None
for key, value in facet_data.items():
    if isinstance(value, dict) and 'buckets' in value:
        facet_key = key
        break
if facet_key and facet_data[facet_key].get('buckets'):
    # This list comprehension is robust for parsing nested metrics
    plot_data = []
    for bucket in facet_data[facet_key]['buckets']:
        category = bucket['val']
        # Find all nested metrics (e.g., total_deal_value_2025)
        for sub_key, sub_value in bucket.items():
            if isinstance(sub_value, dict) and 'sum' in sub_value:
                # Extracts year from 'total_deal_value_2025' -> '2025'
                year = sub_key.split('_')[-1]
                value = sub_value['sum']
                plot_data.append({{'Category': category, 'Year': year, 'Value': value}})
    if plot_data:
        df = pd.DataFrame(plot_data)
        sns.barplot(data=df, x='Category', y='Value', hue='Year', ax=ax)
        ax.set_ylabel('Total Deal Value')
        ax.set_xlabel('Business Model')
        # Correct way to rotate labels to prevent errors
        plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
    else:
        ax.text(0.5, 0.5, 'No nested data found to plot.', ha='center')
else:
    ax.text(0.5, 0.5, 'No data in buckets to plot.', ha='center')
ax.set_title('Your Insightful Title Here')
plt.tight_layout()
```
---
**3. Final Code Generation:**
- **DO NOT** include `plt.show()`.
- **DO** set a dynamic and descriptive `ax.set_title()`, `ax.set_xlabel()`, and `ax.set_ylabel()`.
- **DO NOT** wrap the code in ```python ... ```. Output only the raw Python code.
- Adapt the chosen template to the specific keys and metrics in the provided `facet_data`.
**Your Task:**
Now, generate the Python code.
"""
    try:
        # Increase the timeout for potentially complex generation
        generation_config = genai.types.GenerationConfig(temperature=0, max_output_tokens=2048)
        response = llm_model.generate_content(prompt, generation_config=generation_config)
        # Clean the response to remove markdown formatting
        code = re.sub(r'^```python\s*|\s*```$', '', response.text, flags=re.MULTILINE)
        return code
    except Exception as e:
        # BUG FIX: `response` is unbound when generate_content() itself raises,
        # so the old `response.text` here turned any API error into a NameError.
        # Guard the lookup the same way llm_generate_solr_query_with_history does.
        raw_response_text = response.text if 'response' in locals() else 'N/A'
        print(f"Error in llm_generate_visualization_code: {e}\nRaw response: {raw_response_text}")
        return None
def execute_viz_code_and_get_path(viz_code, facet_data):
    """Executes visualization code and returns the path to the saved plot image.

    Args:
        viz_code: Python source expected to define a Matplotlib figure named
            `fig` (produced by llm_generate_visualization_code).
        facet_data: Solr facet dict exposed to the executed code as `facet_data`.

    Returns:
        str: path of the saved PNG, or None when no code was supplied, the
        code produced no `fig`, or execution/saving failed.
    """
    if not viz_code:
        return None
    try:
        # exist_ok=True avoids the race between the old exists() check and
        # makedirs() when two requests render plots concurrently.
        os.makedirs('/tmp/plots', exist_ok=True)
        # Timestamp in the filename keeps successive plots from clobbering
        # each other.
        plot_path = f"/tmp/plots/plot_{datetime.datetime.now().timestamp()}.png"
        # SECURITY NOTE: exec() runs LLM-generated code with the privileges of
        # this process. That is inherently unsafe for untrusted input; if this
        # app is exposed publicly, consider a sandboxed/restricted executor.
        # The exec environment needs access to the required libraries and the data.
        exec_globals = {'facet_data': facet_data, 'plt': plt, 'sns': sns, 'pd': pd}
        exec(viz_code, exec_globals)
        fig = exec_globals.get('fig')
        if fig:
            fig.savefig(plot_path, bbox_inches='tight')
            plt.close(fig)  # important to free up memory
            return plot_path
        return None
    except Exception as e:
        print(f"ERROR executing visualization code: {e}\n---Code---\n{viz_code}")
        return None
def llm_generate_summary_and_suggestions_stream(query_context, facet_data):
    """
    Yields a streaming analytical report and strategic, context-aware suggestions for further exploration.

    Args:
        query_context: the user's natural-language question.
        facet_data: aggregated Solr facet results serving as the evidence.

    Yields:
        str: successive Markdown chunks of the report as the model streams
        them, or a single apology message if the LLM call fails.
    """
    prompt = f"""
You are a leading business intelligence analyst and strategist. Your audience is an executive or decision-maker who relies on you to not just present data, but to uncover its meaning and suggest smart next steps.
Your task is to analyze the provided data, deliver a concise, insightful report, and then propose logical follow-up analyses that could uncover deeper trends or causes.
**Today's Date for Context:** {datetime.datetime.now().strftime('%Y-%m-%d')}
**Analysis Context:**
* **User's Core Question:** "{query_context}"
* **Structured Data (Your Evidence):**
```json
{json.dumps(facet_data, indent=2)}
```
**--- INSTRUCTIONS ---**
**PART 1: THE ANALYTICAL REPORT**
Structure your report using Markdown. Your tone should be insightful, data-driven, and forward-looking.
* `## Executive Summary`: A 1-2 sentence, top-line answer to the user's core question. Get straight to the point.
* `### Key Findings & Insights`: Use bullet points. Don't just state the data; interpret it.
* Highlight the most significant figures, patterns, or anomalies.
* Where relevant, calculate key differences or growth rates (e.g., "X is 25% higher than Y").
* Pinpoint what the visualization or data reveals about the core business question.
* **Data Note:** Briefly mention any important caveats if apparent from the data (e.g., a short time frame, a small sample size).
* `### Context & Implications`: Briefly explain the "so what?" of these findings. What might this mean for our strategy, the market, or operations?
**PART 2: DEEPER DIVE: SUGGESTED FOLLOW-UP ANALYSES**
After the report, create a final section titled `### Deeper Dive: Suggested Follow-up Analyses`.
* **Think like a strategist.** Based on the findings, what would you ask next to validate a trend, understand a change, or uncover a root cause?
* **Propose 2-3 logical next questions.** These should be concise and framed as natural language questions that inspire further exploration.
* **Focus on comparative and trend analysis.** For example:
* If the user asked for "this year," suggest a comparison: *"How does this year's performance in [X] compare to last year?"*
* If a category is a clear leader, suggest breaking it down: *"What are the top sub-categories driving the growth in [Leading Category]?"*
* If there's a time-based trend, suggest exploring correlations: *"Is the decline in [Metric Z] correlated with changes in any other category during the same period?"*
* Format them as a numbered list.
* Ensure your suggestions are answerable using the available field definitions below.
### FIELD DEFINITIONS (Your Source of Truth)
{formatted_field_info}
**--- YOUR TASK ---**
Generate the full report and the strategic suggestions based on the user's question and the data provided.
"""
    try:
        # stream=True lets the caller render the report incrementally in the UI.
        response_stream = llm_model.generate_content(prompt, stream=True)
        for chunk in response_stream:
            yield chunk.text
    except Exception as e:
        print(f"Error in llm_generate_summary_and_suggestions_stream: {e}")
        yield "Sorry, I was unable to generate a summary for this data."
# CORRECTED: Only one, correctly implemented version of this function remains.
def process_analysis_flow(user_input, history, state):
    """
    A generator that manages the conversation and yields tuples of UI updates for Gradio.
    This version treats any user input as a new query and considers conversation history.

    Each yielded value is a 6-tuple matching the `outputs` wiring below:
        (chatbot history, session state, plot update, report update,
         solr query display update, solr data display update)

    Args:
        user_input: raw text from the textbox.
        history: chatbot history as a list of (user, bot) message pairs,
            or None after a reset.
        state: session dict with 'query_count' and 'last_suggestions',
            or None on first run.
    """
    # Initialize state on the first run
    if state is None:
        state = {'query_count': 0, 'last_suggestions': []}
    # If history is None (from a reset), initialize it as an empty list
    if history is None:
        history = []
    # Reset UI components for the new analysis, but keep chat history
    yield (history, state, gr.update(value=None, visible=False), gr.update(value=None, visible=False), gr.update(value=None, visible=False), gr.update(value=None, visible=False))
    query_context = user_input.strip()
    if not query_context:
        history.append((user_input, "Please enter a question to analyze."))
        yield (history, state, None, None, None, None)
        return
    # 1. Acknowledge and start the process
    history.append((user_input, f"Analyzing: '{query_context}'\n\n*Generating Solr query...*"))
    yield (history, state, None, None, None, None)
    # 2. Generate Solr Query with history
    llm_solr_obj = llm_generate_solr_query_with_history(query_context, field_metadata, history)
    if not llm_solr_obj or 'query' not in llm_solr_obj or 'json.facet' not in llm_solr_obj:
        history.append((None, "I'm sorry, I couldn't generate a valid Solr query for that request. Please try rephrasing your question."))
        yield (history, state, None, None, None, None)
        return
    solr_q, solr_facet = llm_solr_obj.get('query'), llm_solr_obj.get('json.facet')
    history.append((None, "β Solr query generated!"))
    formatted_query = f"**Query:**\n```\n{solr_q}\n```\n\n**Facet JSON:**\n```json\n{json.dumps(solr_facet, indent=2)}\n```"
    yield (history, state, None, None, gr.update(value=formatted_query, visible=True), None)
    # 3. Execute Query
    # NOTE: `formatted_query` is defined before this try, so the except
    # handler below can safely reference it.
    try:
        history.append((None, "*Executing query against the database...*"))
        yield (history, state, None, None, gr.update(value=formatted_query, visible=True), None)
        search_params = {"rows": 0, "json.facet": json.dumps(solr_facet)}
        results = solr_client.search(q=solr_q, **search_params)
        facet_data = results.raw_response.get("facets", {})
        formatted_data = f"**Facet Data:**\n```json\n{json.dumps(facet_data, indent=2)}\n```"
        if not facet_data or facet_data.get('count', 0) == 0:
            history.append((None, "No data was found for your query. Please try a different question."))
            yield (history, state, None, None, gr.update(value=formatted_query, visible=True), gr.update(value=formatted_data, visible=True))
            return
        # 4. Generate Visualization
        history.append((None, "β Data retrieved. Generating visualization..."))
        yield (history, state, None, None, gr.update(value=formatted_query, visible=True), gr.update(value=formatted_data, visible=True))
        # Run the visualization LLM call concurrently with the streamed
        # report so the two generations overlap instead of serializing.
        with concurrent.futures.ThreadPoolExecutor() as executor:
            # Start visualization generation in the background
            viz_future = executor.submit(llm_generate_visualization_code, query_context, facet_data)
            # 5. Generate and Stream Report
            history.append((None, "β Plot created. Streaming final report..."))
            output_report = gr.update(value="", visible=True)  # Make it visible before streaming
            yield (history, state, None, output_report, gr.update(value=formatted_query, visible=True), gr.update(value=formatted_data, visible=True))
            report_text = ""
            # The history object is not modified during streaming, so we pass it once
            # The yield statement for streaming only updates the report text
            stream_history = history[:]  # Make a copy
            for chunk in llm_generate_summary_and_suggestions_stream(query_context, facet_data):
                report_text += chunk
                yield (stream_history, state, None, report_text, gr.update(value=formatted_query, visible=True), gr.update(value=formatted_data, visible=True))
            # Update the main history with the final report text
            history.append((None, report_text))
            # Get the visualization code from the future
            # (blocks until the background generation finishes)
            viz_code = viz_future.result()
            plot_path = execute_viz_code_and_get_path(viz_code, facet_data)
            output_plot = gr.update(value=plot_path, visible=True) if plot_path else gr.update(visible=False)
            if not plot_path:
                history.append((None, "*I was unable to generate a plot for this data.*\n"))
            yield (history, state, output_plot, report_text, gr.update(value=formatted_query, visible=True), gr.update(value=formatted_data, visible=True))
            # 6. Finalize and prompt for next action
            state['query_count'] += 1
            state['last_suggestions'] = parse_suggestions_from_report(report_text)
            next_prompt = "Analysis complete. What would you like to explore next? You can ask a follow-up question, or ask something new."
            history.append((None, next_prompt))
            yield (history, state, output_plot, report_text, gr.update(value=formatted_query, visible=True), gr.update(value=formatted_data, visible=True))
    except Exception as e:
        error_message = f"An unexpected error occurred during analysis: {e}"
        history.append((None, error_message))
        print(f"Error during analysis execution: {e}")
        yield (history, state, None, None, gr.update(value=formatted_query, visible=True), None)
# --- Gradio UI ---
# Layout: chat + input on the left; query/data accordions, plot, and report
# on the right. All six output components are driven by process_analysis_flow.
with gr.Blocks(theme=gr.themes.Soft(), css="footer {display: none !important}") as demo:
    state = gr.State()  # per-session dict managed by process_analysis_flow
    with gr.Row():
        with gr.Column(scale=4):
            gr.Markdown("# π PharmaCircle AI Data Analyst")
        with gr.Column(scale=1):
            clear_button = gr.Button("π Start New Analysis", variant="primary")
    gr.Markdown("Ask a question to begin your analysis. I will generate a Solr query, retrieve the data, create a visualization, and write a report. You can then ask follow-up questions freely.")
    with gr.Row():
        with gr.Column(scale=1):
            chatbot = gr.Chatbot(label="Analysis Chat Log", height=700, show_copy_button=True)
            msg_textbox = gr.Textbox(placeholder="Ask a question, e.g., 'Show me the top 5 companies by total deal value in 2023'", label="Your Question", interactive=True)
        with gr.Column(scale=2):
            with gr.Accordion("Generated Solr Query", open=False):
                # NOTE(review): starts visible=True while the other displays
                # start visible=False; reset_all hides it — looks inconsistent,
                # confirm whether the initial visibility is intentional.
                solr_query_display = gr.Markdown("Query will appear here...", visible=True)
            with gr.Accordion("Retrieved Solr Data", open=False):
                solr_data_display = gr.Markdown("Data will appear here...", visible=False)
            plot_display = gr.Image(label="Visualization", type="filepath", visible=False)
            report_display = gr.Markdown("Report will be streamed here...", visible=False)

    # --- Event Wiring ---
    def reset_all():
        """Resets the entire UI for a new analysis session."""
        return (
            [],  # chatbot (cleared)
            None,  # state (reset)
            "",  # msg_textbox (cleared)
            gr.update(value=None, visible=False),  # plot_display
            gr.update(value=None, visible=False),  # report_display
            gr.update(value=None, visible=False),  # solr_query_display
            gr.update(value=None, visible=False)  # solr_data_display
        )

    # Main event handler for all user queries; the .then() clears the
    # textbox after the generator finishes streaming updates.
    msg_textbox.submit(
        fn=process_analysis_flow,
        inputs=[msg_textbox, chatbot, state],
        outputs=[chatbot, state, plot_display, report_display, solr_query_display, solr_data_display],
    ).then(
        lambda: gr.update(value=""),
        None,
        [msg_textbox],
        queue=False,
    )
    clear_button.click(
        fn=reset_all,
        inputs=None,
        outputs=[chatbot, state, msg_textbox, plot_display, report_display, solr_query_display, solr_data_display],
        queue=False
    )

# Only launch the app if the tunnel/Solr/LLM setup at the top succeeded.
if is_initialized:
    demo.queue().launch(debug=True, share=True)
else:
    print("\nSkipping Gradio launch due to initialization errors.")