import os
import re
import math
from typing import List

from langchain_core.messages import SystemMessage, HumanMessage
from langchain_groq import ChatGroq
from dotenv import load_dotenv

load_dotenv()

TABULAR_VERBOSE = os.environ.get("TABULAR_VERBOSE", "0") in ("1", "true", "True", "yes", "YES")

# Initialize Groq LLM for tabular data using the specialized API key
TABULAR_MODEL = os.environ.get("GROQ_TABULAR_MODEL", os.environ.get("GROQ_MODEL_TABULAR", "qwen/qwen3-32b"))
GROQ_LLM = ChatGroq(
    groq_api_key=os.environ.get("GROQ_API_KEY_TABULAR", os.environ.get("GROQ_API_KEY")),
    model_name=TABULAR_MODEL,
)
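# A minimal example .env, assuming the variable names referenced above; the values
# are illustrative placeholders, and the model identifier is just the default used here:
#   GROQ_API_KEY_TABULAR=<your-groq-api-key>
#   GROQ_TABULAR_MODEL=qwen/qwen3-32b
#   TABULAR_VERBOSE=1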
def get_answer_for_tabluar(
    data: str,
    questions: List[str],
    batch_size: int = 10,
    verbose: bool = False,
) -> List[str]:
    """
    Query the Groq LLM for tabular data analysis, handling batches and preserving
    the order of answers.

    Args:
        data (str): Tabular context in markdown or plain text.
        questions (List[str]): List of questions to ask.
        batch_size (int): Maximum number of questions per batch.
        verbose (bool): If True, print raw LLM responses.

    Returns:
        List[str]: Ordered list of answers corresponding to the input questions.
    """

    def parse_numbered_answers(text: str, expected: int) -> List[str]:
        """
        Parse answers from a numbered-list format ('1.', '2.', etc.).
        Uses a non-greedy capture with a lookahead to stop at the next number or
        at the end of the text.
        """
        pattern = re.compile(
            r"^\s*(\d{1,2})[\.)\-]\s*(.*?)(?=\n\s*\d{1,2}[\.)\-]\s*|$)",
            re.MULTILINE | re.DOTALL,
        )
        matches = pattern.findall(text)
        result = {}
        for num_str, answer in matches:
            try:
                num = int(num_str)
            except ValueError:
                continue
            if 1 <= num <= expected:
                clean_answer = re.sub(r'\s+', ' ', answer).strip()
                result[num] = clean_answer

        # If no structured matches, fall back to a line-based heuristic
        if not result:
            lines = [ln.strip() for ln in text.strip().splitlines() if ln.strip()]
            for i in range(min(expected, len(lines))):
                result[i + 1] = lines[i]

        # Build a fixed-length list so answers stay aligned with their questions
        answers = []
        for i in range(1, expected + 1):
            answers.append(result.get(i, f"Unable to answer question {i}"))
        return answers
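    # Illustrative sketch of the parsing behaviour (not executed): a response such as
    # "1. 42 rows\n2. Information not available in the provided data" yields
    # ["42 rows", "Information not available in the provided data"], and any missing
    # number i is filled with "Unable to answer question i".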
    if not questions:
        return []

    # Process questions in batches
    all_answers = []
    total_batches = math.ceil(len(questions) / batch_size)

    for batch_idx in range(total_batches):
        start = batch_idx * batch_size
        end = min(start + batch_size, len(questions))
        batch_questions = questions[start:end]

        print(f"Processing batch {batch_idx + 1}/{total_batches} ({len(batch_questions)} questions)")

        # Create a numbered question list
        numbered_questions = "\n".join([f"{i+1}. {q}" for i, q in enumerate(batch_questions)])

        # Create prompts
        system_prompt = """You are an expert data analyst. Analyze the provided tabular data and answer the questions accurately.
Instructions:
- Answer each question based ONLY on the data provided
- If data is insufficient, state "Information not available in the provided data"
- Provide clear, concise answers
- Format your response as a numbered list (1., 2., 3., etc.)
- Do not add explanations unless specifically asked"""

        user_prompt = f"""Data:
{data}
Questions:
{numbered_questions}
Please provide numbered answers (1., 2., 3., etc.) for each question."""

        try:
            # Create messages
            messages = [
                SystemMessage(content=system_prompt),
                HumanMessage(content=user_prompt),
            ]

            # Get response from LLM
            response = GROQ_LLM.invoke(messages)
            raw_response = response.content or ""

            if verbose or TABULAR_VERBOSE:
                print(f"🟢 Raw LLM Response (batch {batch_idx + 1}):\n{raw_response[:1200]}\n--- END RAW ---")

            # Parse the response
            batch_answers = parse_numbered_answers(raw_response, len(batch_questions))
            all_answers.extend(batch_answers)

        except Exception as e:
            print(f"Error processing batch {batch_idx + 1}: {str(e)}")
            # Add error placeholders for this batch so answer ordering is preserved
            error_answers = [f"Error processing question: {str(e)}" for _ in batch_questions]
            all_answers.extend(error_answers)

    return all_answers
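
# A minimal usage sketch, assuming a valid Groq API key is set in the environment;
# the table content and questions below are illustrative only.
if __name__ == "__main__":
    sample_table = (
        "| product | units_sold | revenue |\n"
        "|---------|------------|---------|\n"
        "| A       | 120        | 2400    |\n"
        "| B       | 80         | 2000    |\n"
    )
    sample_questions = [
        "Which product sold the most units?",
        "What is the total revenue across all products?",
    ]
    answers = get_answer_for_tabluar(sample_table, sample_questions, verbose=True)
    for question, answer in zip(sample_questions, answers):
        print(f"Q: {question}\nA: {answer}\n")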