Update app.py
app.py
CHANGED
@@ -32,8 +32,10 @@ sys.path.insert(0, src_path)
 from txagent.txagent import TxAgent
 
 # Constants
-MAX_TOKENS = …
-MAX_NEW_TOKENS = …
+MAX_MODEL_TOKENS = 32768  # Model's maximum sequence length
+MAX_CHUNK_TOKENS = 8192   # Chunk size aligned with max_num_batched_tokens
+MAX_NEW_TOKENS = 2048     # Maximum tokens for generation
+PROMPT_OVERHEAD = 500     # Estimated tokens for prompt template overhead
 
 def clean_response(text: str) -> str:
     try:
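The four constants form a layered budget: one chunk's text (at most 8192 estimated tokens) plus the prompt template (about 500) plus the generated answer (up to 2048) must fit inside the 32768-token window. A standalone check of that arithmetic (ours, not code from app.py):

```python
MAX_MODEL_TOKENS = 32768
MAX_CHUNK_TOKENS = 8192
MAX_NEW_TOKENS = 2048
PROMPT_OVERHEAD = 500

# 8192 + 500 + 2048 = 10740 tokens per chunk request, well under 32768.
assert MAX_CHUNK_TOKENS + PROMPT_OVERHEAD + MAX_NEW_TOKENS <= MAX_MODEL_TOKENS
```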
@@ -46,40 +48,56 @@ def clean_response(text: str) -> str:
     return text.strip()
 
 def estimate_tokens(text: str) -> int:
-    …
+    """Estimate the number of tokens based on character length."""
+    return int(len(text) / 3.5) + 1  # Add 1 to avoid zero estimates
 
 def extract_text_from_excel(file_path: str) -> str:
+    """Extract text from all sheets in an Excel file."""
     all_text = []
-    …
-    …
-    …
-    …
-    …
-    …
-    …
+    try:
+        xls = pd.ExcelFile(file_path)
+        for sheet_name in xls.sheet_names:
+            df = xls.parse(sheet_name)
+            df = df.astype(str).fillna("")
+            rows = df.apply(lambda row: " | ".join(row), axis=1)
+            sheet_text = [f"[{sheet_name}] {line}" for line in rows]
+            all_text.extend(sheet_text)
+    except Exception as e:
+        raise ValueError(f"Failed to extract text from Excel file: {str(e)}")
     return "\n".join(all_text)
 
-def split_text_into_chunks(text: str, max_tokens: int = …
+def split_text_into_chunks(text: str, max_tokens: int = MAX_CHUNK_TOKENS) -> List[str]:
+    """
+    Split text into chunks, ensuring each chunk is within token limits,
+    accounting for prompt overhead.
+    """
+    effective_max_tokens = max_tokens - PROMPT_OVERHEAD
+    if effective_max_tokens <= 0:
+        raise ValueError(f"Effective max tokens ({effective_max_tokens}) must be positive.")
+
     lines = text.split("\n")
     chunks = []
     current_chunk = []
     current_tokens = 0
 
     for line in lines:
-        …
-        if current_tokens + …:
-            …
+        line_tokens = estimate_tokens(line)
+        if current_tokens + line_tokens > effective_max_tokens:
+            if current_chunk:  # Save the current chunk if it's not empty
+                chunks.append("\n".join(current_chunk))
             current_chunk = [line]
-            current_tokens = …
+            current_tokens = line_tokens
         else:
             current_chunk.append(line)
-            current_tokens += …
+            current_tokens += line_tokens
 
     if current_chunk:
         chunks.append("\n".join(current_chunk))
+
     return chunks
 
 def build_prompt_from_text(chunk: str) -> str:
+    """Build a prompt for analyzing a chunk of clinical data."""
     return f"""
 ### Unstructured Clinical Records
 
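As a quick sanity check on the new extractor, each sheet row becomes one `[SheetName] cell | cell | ...` line. A minimal standalone sketch of that row-flattening (the workbook `demo.xlsx` and its contents are invented for illustration; pandas plus an Excel engine such as openpyxl is assumed):

```python
import pandas as pd

# Hypothetical two-row workbook, just to exercise the flattening logic.
pd.DataFrame({"patient_id": [1, 2], "note": ["fever", "cough"]}).to_excel(
    "demo.xlsx", sheet_name="Visits", index=False
)

xls = pd.ExcelFile("demo.xlsx")
for sheet_name in xls.sheet_names:
    df = xls.parse(sheet_name).astype(str).fillna("")
    for line in df.apply(lambda row: " | ".join(row), axis=1):
        print(f"[{sheet_name}] {line}")
# [Visits] 1 | fever
# [Visits] 2 | cough
```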
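For intuition about the chunk budget: each chunk may carry at most `max_tokens - PROMPT_OVERHEAD` estimated tokens, i.e. 8192 - 500 = 7692 with the defaults. A self-contained re-statement of the splitting logic from the hunk above, runnable on its own (constants and the ~3.5-characters-per-token heuristic copied from the diff; this is a sketch, not the app's module):

```python
MAX_CHUNK_TOKENS = 8192
PROMPT_OVERHEAD = 500

def estimate_tokens(text: str) -> int:
    return int(len(text) / 3.5) + 1  # ~3.5 chars per token, never zero

def split_text_into_chunks(text: str, max_tokens: int = MAX_CHUNK_TOKENS) -> list[str]:
    effective_max = max_tokens - PROMPT_OVERHEAD  # 7692 with the defaults
    chunks, current, used = [], [], 0
    for line in text.split("\n"):
        tokens = estimate_tokens(line)
        if used + tokens > effective_max:
            if current:  # flush the chunk in progress
                chunks.append("\n".join(current))
            current, used = [line], tokens
        else:
            current.append(line)
            used += tokens
    if current:
        chunks.append("\n".join(current))
    return chunks

sample = "\n".join(f"[Visits] row {i} | value {i}" for i in range(10000))
parts = split_text_into_chunks(sample)
print(len(parts), "chunks; largest ~", max(estimate_tokens(p) for p in parts), "tokens")
```

Note that a single line longer than the budget still becomes its own oversized chunk; the per-chunk prompt-length check added in the processing loop further below catches that case.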
@@ -100,6 +118,7 @@ Please analyze the above and provide:
 """
 
 def init_agent():
+    """Initialize the TxAgent with model and tool configurations."""
     default_tool_path = os.path.abspath("data/new_tool.json")
     target_tool_path = os.path.join(tool_cache_dir, "new_tool.json")
 

@@ -120,6 +139,7 @@ def init_agent():
     return agent
 
 def process_final_report(agent, file, chatbot_state: List[Dict[str, str]]) -> Tuple[List[Dict[str, str]], Union[str, None]]:
+    """Process the Excel file and generate a final report."""
     messages = chatbot_state if chatbot_state else []
     report_path = None
 
@@ -131,61 +151,118 @@ def process_final_report(agent, file, chatbot_state: List[Dict[str, str]]) -> Tuple[List[Dict[str, str]], Union[str, None]]:
     messages.append({"role": "user", "content": f"Processing Excel file: {os.path.basename(file.name)}"})
     messages.append({"role": "assistant", "content": "⏳ Extracting and analyzing data..."})
 
+    # Extract text and split into chunks
     extracted_text = extract_text_from_excel(file.name)
-    chunks = split_text_into_chunks(extracted_text)
+    chunks = split_text_into_chunks(extracted_text, max_tokens=MAX_CHUNK_TOKENS)
     chunk_responses = []
 
+    # Process each chunk
     for i, chunk in enumerate(chunks):
         messages.append({"role": "assistant", "content": f"🔍 Analyzing chunk {i+1}/{len(chunks)}..."})
 
         prompt = build_prompt_from_text(chunk)
+        prompt_tokens = estimate_tokens(prompt)
+        if prompt_tokens > MAX_MODEL_TOKENS:
+            messages.append({"role": "assistant", "content": f"❌ Chunk {i+1} prompt too long ({prompt_tokens} tokens). Skipping..."})
+            continue
+
         response = ""
-        for result in agent.run_gradio_chat(
-            message=prompt,
-            history=[],
-            temperature=0.2,
-            max_new_tokens=MAX_NEW_TOKENS,
-            max_token=MAX_TOKENS,
-            call_agent=False,
-            conversation=[],
-        ):
-            if isinstance(result, str):
-                response += result
-            elif hasattr(result, "content"):
-                response += result.content
-            elif isinstance(result, list):
-                for r in result:
-                    if hasattr(r, "content"):
-                        response += r.content
-
-        chunk_responses.append(…)
-        messages.append(…)
-
+        try:
+            for result in agent.run_gradio_chat(
+                message=prompt,
+                history=[],
+                temperature=0.2,
+                max_new_tokens=MAX_NEW_TOKENS,
+                max_token=MAX_MODEL_TOKENS,
+                call_agent=False,
+                conversation=[],
+            ):
+                if isinstance(result, str):
+                    response += result
+                elif hasattr(result, "content"):
+                    response += result.content
+                elif isinstance(result, list):
+                    for r in result:
+                        if hasattr(r, "content"):
+                            response += r.content
+        except Exception as e:
+            messages.append({"role": "assistant", "content": f"❌ Error analyzing chunk {i+1}: {str(e)}"})
+            continue
+
+        chunk_responses.append(clean_response(response))
+        messages.append({"role": "assistant", "content": f"✅ Chunk {i+1} analysis complete"})
+
+    if not chunk_responses:
+        messages.append({"role": "assistant", "content": "❌ No valid chunk responses to summarize."})
+        return messages, report_path
+
+    # Summarize chunk responses incrementally to avoid the token limit
+    summary = ""
+    current_summary_tokens = 0
+    for i, response in enumerate(chunk_responses):
+        response_tokens = estimate_tokens(response)
+        if current_summary_tokens + response_tokens > MAX_MODEL_TOKENS - PROMPT_OVERHEAD - MAX_NEW_TOKENS:
+            # Condense the running summary before appending more
+            summary_prompt = f"Summarize the following analysis:\n\n{summary}\n\nProvide a concise summary."
+            summary_response = ""
+            try:
+                for result in agent.run_gradio_chat(
+                    message=summary_prompt,
+                    history=[],
+                    temperature=0.2,
+                    max_new_tokens=MAX_NEW_TOKENS,
+                    max_token=MAX_MODEL_TOKENS,
+                    call_agent=False,
+                    conversation=[],
+                ):
+                    if isinstance(result, str):
+                        summary_response += result
+                    elif hasattr(result, "content"):
+                        summary_response += result.content
+                    elif isinstance(result, list):
+                        for r in result:
+                            if hasattr(r, "content"):
+                                summary_response += r.content
+                summary = clean_response(summary_response)
+                current_summary_tokens = estimate_tokens(summary)
+            except Exception as e:
+                messages.append({"role": "assistant", "content": f"❌ Error summarizing intermediate results: {str(e)}"})
+                return messages, report_path
+
+        summary += f"\n\n### Chunk {i+1} Analysis\n{response}"
+        current_summary_tokens += response_tokens
+
-    final_prompt = "\n\n".join(chunk_responses) + "\n\nSummarize the key findings above."
+    # Final summarization
+    final_prompt = f"Summarize the key findings from the following analyses:\n\n{summary}"
     messages.append({"role": "assistant", "content": "📊 Generating final report..."})
 
-    stream_text = ""
-    for result in agent.run_gradio_chat(
-        message=final_prompt,
-        history=[],
-        temperature=0.2,
-        max_new_tokens=MAX_NEW_TOKENS,
-        max_token=MAX_TOKENS,
-        call_agent=False,
-        conversation=[],
-    ):
-        if isinstance(result, str):
-            stream_text += result
-        elif hasattr(result, "content"):
-            stream_text += result.content
-        elif isinstance(result, list):
-            for r in result:
-                if hasattr(r, "content"):
-                    stream_text += r.content
-
-    final_report = f"# \U0001f9e0 Final Patient Report\n\n{clean_response(stream_text)}"
-    messages[-1]["content"] = f"📄 Final Report:\n\n{clean_response(stream_text)}"
+    final_report_text = ""
+    try:
+        for result in agent.run_gradio_chat(
+            message=final_prompt,
+            history=[],
+            temperature=0.2,
+            max_new_tokens=MAX_NEW_TOKENS,
+            max_token=MAX_MODEL_TOKENS,
+            call_agent=False,
+            conversation=[],
+        ):
+            if isinstance(result, str):
+                final_report_text += result
+            elif hasattr(result, "content"):
+                final_report_text += result.content
+            elif isinstance(result, list):
+                for r in result:
+                    if hasattr(r, "content"):
+                        final_report_text += r.content
+    except Exception as e:
+        messages.append({"role": "assistant", "content": f"❌ Error generating final report: {str(e)}"})
+        return messages, report_path
 
+    final_report = f"# \U0001f9e0 Final Patient Report\n\n{clean_response(final_report_text)}"
+    messages[-1]["content"] = f"📄 Final Report:\n\n{clean_response(final_report_text)}"
+
+    # Save the report
     timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
     report_path = os.path.join(report_dir, f"report_{timestamp}.md")
 
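The incremental-summarization guard in the hunk above keeps the running summary inside the context window: the threshold `MAX_MODEL_TOKENS - PROMPT_OVERHEAD - MAX_NEW_TOKENS` works out to 32768 - 500 - 2048 = 30220 estimated tokens before a mid-flight condensation is triggered. A small sketch of just that bookkeeping, with the model round-trip stubbed out (`fold_responses` and `condense` are our names, not part of app.py):

```python
MAX_MODEL_TOKENS = 32768
MAX_NEW_TOKENS = 2048
PROMPT_OVERHEAD = 500
BUDGET = MAX_MODEL_TOKENS - PROMPT_OVERHEAD - MAX_NEW_TOKENS  # 30220

def estimate_tokens(text: str) -> int:
    return int(len(text) / 3.5) + 1

def fold_responses(responses, condense):
    """Accumulate chunk analyses, condensing whenever the budget would overflow."""
    summary, used = "", 0
    for i, response in enumerate(responses):
        tokens = estimate_tokens(response)
        if used + tokens > BUDGET:
            summary = condense(summary)  # in app.py: one agent.run_gradio_chat call
            used = estimate_tokens(summary)
        summary += f"\n\n### Chunk {i+1} Analysis\n{response}"
        used += tokens
    return summary

# Stub condenser: truncate instead of asking the model.
print(len(fold_responses(["x" * 50000] * 5, lambda s: s[:1000])))
```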
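One refactoring note: the `isinstance`/`hasattr` drain over `agent.run_gradio_chat(...)` now appears three times in this function. If a follow-up cleanup is wanted, it could live in a single helper; a hedged sketch using the same call signature the diff uses (`collect_stream` is our name, not part of the app):

```python
def collect_stream(agent, message: str, max_new_tokens: int, max_token: int) -> str:
    """Drain agent.run_gradio_chat and concatenate every text piece it yields."""
    out = ""
    for result in agent.run_gradio_chat(
        message=message,
        history=[],
        temperature=0.2,
        max_new_tokens=max_new_tokens,
        max_token=max_token,
        call_agent=False,
        conversation=[],
    ):
        if isinstance(result, str):
            out += result
        elif hasattr(result, "content"):
            out += result.content
        elif isinstance(result, list):
            out += "".join(r.content for r in result if hasattr(r, "content"))
    return out

# e.g. response = collect_stream(agent, prompt, MAX_NEW_TOKENS, MAX_MODEL_TOKENS)
```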
@@ -200,6 +277,7 @@ def process_final_report(agent, file, chatbot_state: List[Dict[str, str]]) -> Tuple[List[Dict[str, str]], Union[str, None]]:
     return messages, report_path
 
 def create_ui(agent):
+    """Create the Gradio UI for the patient history analysis tool."""
     with gr.Blocks(title="Patient History Chat", css=".gradio-container {max-width: 900px !important}") as demo:
         gr.Markdown("## 🏥 Patient History Analysis Tool")
 
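A closing note on the 3.5-characters-per-token heuristic: it is cheap and dependency-free, but approximate. If exact budgets ever matter, the count could come from the model's own tokenizer instead; a sketch assuming the Hugging Face transformers API (the checkpoint name below is a placeholder, not the model TxAgent actually loads):

```python
from transformers import AutoTokenizer

# Placeholder checkpoint; substitute whatever model TxAgent serves.
tokenizer = AutoTokenizer.from_pretrained("some-org/some-model")

def estimate_tokens(text: str) -> int:
    # Exact count under this tokenizer, replacing the len(text) / 3.5 heuristic.
    return len(tokenizer.encode(text, add_special_tokens=False)) + 1
```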