LeonceNsh committed on
Commit
029af47
·
verified ·
1 Parent(s): 349cfdf

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. gradio_app.py +66 -2
  2. response_generator.py +9 -2
gradio_app.py CHANGED
@@ -1,8 +1,9 @@
1
  import os
2
  import tempfile
3
  import pickle
 
4
  from io import StringIO
5
- from typing import Tuple
6
  import gradio as gr
7
  import faiss
8
  import pandas as pd
@@ -71,6 +72,43 @@ def split_explanation_and_csv(raw_text: str) -> Tuple[str, str]:
71
  return "", text.strip()
72
 
73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  class SyntheticDataApp:
75
  def __init__(self):
76
  self.sample_df = None
@@ -81,6 +119,7 @@ class SyntheticDataApp:
81
  def process_pdf_and_generate_sample(
82
  self,
83
  pdf_file,
 
84
  llama_key: str,
85
  openrouter_key: str,
86
  model_name: str = "google/gemini-flash-1.5",
@@ -108,6 +147,10 @@ class SyntheticDataApp:
108
  index = generate_faiss_index(embeddings)
109
 
110
  progress(0.6, desc="Generating synthetic data...")
 
 
 
 
111
  with tempfile.TemporaryDirectory() as tmpdir:
112
  index_path = os.path.join(tmpdir, "faiss_index.index")
113
  chunks_path = os.path.join(tmpdir, "text_chunks.pkl")
@@ -121,6 +164,7 @@ class SyntheticDataApp:
121
  index_path=index_path,
122
  text_chunks_path=chunks_path,
123
  max_context_length=8000,
 
124
  )
125
  result = generator.generate_synthetic_data(k=int(k_chunks))
126
  raw_response = result.get("response", "")
@@ -260,6 +304,25 @@ def create_interface():
260
  label="📄 Upload Research Paper (PDF)",
261
  file_types=[".pdf"]
262
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
263
 
264
  with gr.Column(scale=1):
265
  gr.HTML("""
@@ -269,6 +332,7 @@ def create_interface():
269
  <li>Upload healthcare/medical research papers</li>
270
  <li>Ensure PDF contains tables or data descriptions</li>
271
  <li>Clear text (not scanned images) works best</li>
 
272
  </ul>
273
  </div>
274
  """)
@@ -405,7 +469,7 @@ def create_interface():
405
  # Event handlers
406
  generate_btn.click(
407
  fn=app.process_pdf_and_generate_sample,
408
- inputs=[pdf_file, llama_key, openrouter_key, model_name, k_chunks],
409
  outputs=[generation_status, sample_data_preview, explanation_output, sample_download],
410
  show_progress=True
411
  ).then(
 
1
  import os
2
  import tempfile
3
  import pickle
4
+ import json
5
  from io import StringIO
6
+ from typing import Tuple, Optional
7
  import gradio as gr
8
  import faiss
9
  import pandas as pd
 
72
  return "", text.strip()
73
 
74
 
75
+ def parse_metadata_file(metadata_file) -> Optional[str]:
76
+ """Parse uploaded metadata file and format for LLM prompt."""
77
+ if not metadata_file:
78
+ return None
79
+
80
+ try:
81
+ with open(metadata_file.name, 'r') as f:
82
+ content = f.read().strip()
83
+
84
+ # Try to parse as JSON first
85
+ try:
86
+ metadata = json.loads(content)
87
+ if isinstance(metadata, dict):
88
+ # Format as structured metadata prompt
89
+ metadata_prompt = "\n\nExpected Data Schema:\n"
90
+ for column, info in metadata.items():
91
+ if isinstance(info, dict):
92
+ col_type = info.get('type', 'unknown')
93
+ description = info.get('description', '')
94
+ metadata_prompt += f"- {column}: {col_type}"
95
+ if description:
96
+ metadata_prompt += f" - {description}"
97
+ metadata_prompt += "\n"
98
+ else:
99
+ metadata_prompt += f"- {column}: {info}\n"
100
+ return metadata_prompt
101
+ except json.JSONDecodeError:
102
+ pass
103
+
104
+ # If not JSON, treat as plain text metadata
105
+ return f"\n\nExpected Data Schema:\n{content}\n"
106
+
107
+ except Exception as e:
108
+ print(f"Error parsing metadata file: {e}")
109
+ return None
110
+
111
+
112
  class SyntheticDataApp:
113
  def __init__(self):
114
  self.sample_df = None
 
119
  def process_pdf_and_generate_sample(
120
  self,
121
  pdf_file,
122
+ metadata_file,
123
  llama_key: str,
124
  openrouter_key: str,
125
  model_name: str = "google/gemini-flash-1.5",
 
147
  index = generate_faiss_index(embeddings)
148
 
149
  progress(0.6, desc="Generating synthetic data...")
150
+
151
+ # Parse metadata if provided
152
+ metadata_prompt = parse_metadata_file(metadata_file)
153
+
154
  with tempfile.TemporaryDirectory() as tmpdir:
155
  index_path = os.path.join(tmpdir, "faiss_index.index")
156
  chunks_path = os.path.join(tmpdir, "text_chunks.pkl")
 
164
  index_path=index_path,
165
  text_chunks_path=chunks_path,
166
  max_context_length=8000,
167
+ metadata_context=metadata_prompt
168
  )
169
  result = generator.generate_synthetic_data(k=int(k_chunks))
170
  raw_response = result.get("response", "")
 
304
  label="📄 Upload Research Paper (PDF)",
305
  file_types=[".pdf"]
306
  )
307
+ metadata_file = gr.File(
308
+ label="📋 Upload Data Schema/Metadata (Optional)",
309
+ file_types=[".json", ".txt", ".md"],
310
+ info="Provide expected column types and descriptions to guide data generation"
311
+ )
312
+
313
+ gr.HTML("""
314
+ <div style="background-color: #e8f4fd; padding: 0.8em; border-radius: 6px; border: 1px solid #b3d9ff; margin-top: 0.5em;">
315
+ <h5>📋 Metadata Format Examples:</h5>
316
+ <p><b>JSON format:</b></p>
317
+ <pre style="font-size: 0.8em; background-color: #f8f9fa; padding: 0.5em; border-radius: 4px;">
318
+ {
319
+ "age": {"type": "integer", "description": "Patient age in years"},
320
+ "gender": {"type": "categorical", "description": "Male/Female"},
321
+ "blood_pressure": {"type": "float", "description": "Systolic BP in mmHg"}
322
+ }</pre>
323
+ <p><b>Text format:</b> Simply describe your expected columns and their types.</p>
324
+ </div>
325
+ """)
326
 
327
  with gr.Column(scale=1):
328
  gr.HTML("""
 
332
  <li>Upload healthcare/medical research papers</li>
333
  <li>Ensure PDF contains tables or data descriptions</li>
334
  <li>Clear text (not scanned images) works best</li>
335
+ <li>Upload metadata to specify expected column types</li>
336
  </ul>
337
  </div>
338
  """)
 
469
  # Event handlers
470
  generate_btn.click(
471
  fn=app.process_pdf_and_generate_sample,
472
+ inputs=[pdf_file, metadata_file, llama_key, openrouter_key, model_name, k_chunks],
473
  outputs=[generation_status, sample_data_preview, explanation_output, sample_download],
474
  show_progress=True
475
  ).then(
response_generator.py CHANGED
@@ -25,7 +25,8 @@ class SyntheticDataGenerator:
25
  embedding_model_name: str = "all-MiniLM-L6-v2",
26
  index_path: str = "faiss_index.index",
27
  text_chunks_path: str = "text_chunks.pkl",
28
- max_context_length: int = 8000):
 
29
  """
30
  Initializes the SyntheticDataGenerator.
31
 
@@ -36,6 +37,7 @@ class SyntheticDataGenerator:
36
  index_path: Path to the FAISS index of the paper's text.
37
  text_chunks_path: Path to the pickled text chunks from the paper.
38
  max_context_length: Maximum context length for the LLM prompt.
 
39
  """
40
  # Set up OpenRouter API key
41
  if openai_api_key:
@@ -54,6 +56,7 @@ class SyntheticDataGenerator:
54
  )
55
  self.model_name = model_name
56
  self.max_context_length = max_context_length
 
57
 
58
  # Load embedding model and FAISS index
59
  print("Loading embedding model and FAISS index...")
@@ -125,15 +128,19 @@ The output should be only the synthetic data in CSV format. Give an explanation
125
  what their distribution is.
126
  """
127
 
 
 
 
128
  user_prompt = f"""Based on the following context from a research paper, please generate a synthetic dataset of 100 records.
129
 
130
  Context from the paper:
131
- {context_string}
132
  You have to keep the synthetic data with the similar distribution in all features.
133
  The collinearity between the synthetic data should remain similar to what is mentioned in the paper.
134
  The distribution of the categorical data should be consistent in the data used regarding if it is imbalanced or not.
135
  The distribution of numerical data should be consistent with either uniform distribution or binomial distribution or normal distribution.
136
  This should be applied to each and every feature. If there is a skew in the distribution, you should keep it as it is.
 
137
  Give an explanation on what features have been used in the paper, what their distribution is.
138
  Generate a sample dataset of 100 records of synthetic data for these features.
139
  """
 
25
  embedding_model_name: str = "all-MiniLM-L6-v2",
26
  index_path: str = "faiss_index.index",
27
  text_chunks_path: str = "text_chunks.pkl",
28
+ max_context_length: int = 8000,
29
+ metadata_context: Optional[str] = None):
30
  """
31
  Initializes the SyntheticDataGenerator.
32
 
 
37
  index_path: Path to the FAISS index of the paper's text.
38
  text_chunks_path: Path to the pickled text chunks from the paper.
39
  max_context_length: Maximum context length for the LLM prompt.
40
+ metadata_context: Optional metadata context to guide data generation.
41
  """
42
  # Set up OpenRouter API key
43
  if openai_api_key:
 
56
  )
57
  self.model_name = model_name
58
  self.max_context_length = max_context_length
59
+ self.metadata_context = metadata_context
60
 
61
  # Load embedding model and FAISS index
62
  print("Loading embedding model and FAISS index...")
 
128
  what their distribution is.
129
  """
130
 
131
+ # Build user prompt with optional metadata context
132
+ metadata_section = self.metadata_context if self.metadata_context else ""
133
+
134
  user_prompt = f"""Based on the following context from a research paper, please generate a synthetic dataset of 100 records.
135
 
136
  Context from the paper:
137
+ {context_string}{metadata_section}
138
  You have to keep the synthetic data with the similar distribution in all features.
139
  The collinearity between the synthetic data should remain similar to what is mentioned in the paper.
140
  The distribution of the categorical data should be consistent in the data used regarding if it is imbalanced or not.
141
  The distribution of numerical data should be consistent with either uniform distribution or binomial distribution or normal distribution.
142
  This should be applied to each and every feature. If there is a skew in the distribution, you should keep it as it is.
143
+ {"Please follow the expected data schema provided above when generating the synthetic data." if self.metadata_context else ""}
144
  Give an explanation on what features have been used in the paper, what their distribution is.
145
  Generate a sample dataset of 100 records of synthetic data for these features.
146
  """