LeonceNsh committed on
Commit
029af47
·
verified ·
1 Parent(s): 349cfdf

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. gradio_app.py +66 -2
  2. response_generator.py +9 -2
gradio_app.py CHANGED
@@ -1,8 +1,9 @@
1
  import os
2
  import tempfile
3
  import pickle
 
4
  from io import StringIO
5
- from typing import Tuple
6
  import gradio as gr
7
  import faiss
8
  import pandas as pd
@@ -71,6 +72,43 @@ def split_explanation_and_csv(raw_text: str) -> Tuple[str, str]:
71
  return "", text.strip()
72
 
73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  class SyntheticDataApp:
75
  def __init__(self):
76
  self.sample_df = None
@@ -81,6 +119,7 @@ class SyntheticDataApp:
81
  def process_pdf_and_generate_sample(
82
  self,
83
  pdf_file,
 
84
  llama_key: str,
85
  openrouter_key: str,
86
  model_name: str = "google/gemini-flash-1.5",
@@ -108,6 +147,10 @@ class SyntheticDataApp:
108
  index = generate_faiss_index(embeddings)
109
 
110
  progress(0.6, desc="Generating synthetic data...")
 
 
 
 
111
  with tempfile.TemporaryDirectory() as tmpdir:
112
  index_path = os.path.join(tmpdir, "faiss_index.index")
113
  chunks_path = os.path.join(tmpdir, "text_chunks.pkl")
@@ -121,6 +164,7 @@ class SyntheticDataApp:
121
  index_path=index_path,
122
  text_chunks_path=chunks_path,
123
  max_context_length=8000,
 
124
  )
125
  result = generator.generate_synthetic_data(k=int(k_chunks))
126
  raw_response = result.get("response", "")
@@ -260,6 +304,25 @@ def create_interface():
260
  label="📄 Upload Research Paper (PDF)",
261
  file_types=[".pdf"]
262
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
263
 
264
  with gr.Column(scale=1):
265
  gr.HTML("""
@@ -269,6 +332,7 @@ def create_interface():
269
  <li>Upload healthcare/medical research papers</li>
270
  <li>Ensure PDF contains tables or data descriptions</li>
271
  <li>Clear text (not scanned images) works best</li>
 
272
  </ul>
273
  </div>
274
  """)
@@ -405,7 +469,7 @@ def create_interface():
405
  # Event handlers
406
  generate_btn.click(
407
  fn=app.process_pdf_and_generate_sample,
408
- inputs=[pdf_file, llama_key, openrouter_key, model_name, k_chunks],
409
  outputs=[generation_status, sample_data_preview, explanation_output, sample_download],
410
  show_progress=True
411
  ).then(
 
1
  import os
2
  import tempfile
3
  import pickle
4
+ import json
5
  from io import StringIO
6
+ from typing import Tuple, Optional
7
  import gradio as gr
8
  import faiss
9
  import pandas as pd
 
72
  return "", text.strip()
73
 
74
 
75
+ def parse_metadata_file(metadata_file) -> Optional[str]:
76
+ """Parse uploaded metadata file and format for LLM prompt."""
77
+ if not metadata_file:
78
+ return None
79
+
80
+ try:
81
+ with open(metadata_file.name, 'r') as f:
82
+ content = f.read().strip()
83
+
84
+ # Try to parse as JSON first
85
+ try:
86
+ metadata = json.loads(content)
87
+ if isinstance(metadata, dict):
88
+ # Format as structured metadata prompt
89
+ metadata_prompt = "\n\nExpected Data Schema:\n"
90
+ for column, info in metadata.items():
91
+ if isinstance(info, dict):
92
+ col_type = info.get('type', 'unknown')
93
+ description = info.get('description', '')
94
+ metadata_prompt += f"- {column}: {col_type}"
95
+ if description:
96
+ metadata_prompt += f" - {description}"
97
+ metadata_prompt += "\n"
98
+ else:
99
+ metadata_prompt += f"- {column}: {info}\n"
100
+ return metadata_prompt
101
+ except json.JSONDecodeError:
102
+ pass
103
+
104
+ # If not JSON, treat as plain text metadata
105
+ return f"\n\nExpected Data Schema:\n{content}\n"
106
+
107
+ except Exception as e:
108
+ print(f"Error parsing metadata file: {e}")
109
+ return None
110
+
111
+
112
  class SyntheticDataApp:
113
  def __init__(self):
114
  self.sample_df = None
 
119
  def process_pdf_and_generate_sample(
120
  self,
121
  pdf_file,
122
+ metadata_file,
123
  llama_key: str,
124
  openrouter_key: str,
125
  model_name: str = "google/gemini-flash-1.5",
 
147
  index = generate_faiss_index(embeddings)
148
 
149
  progress(0.6, desc="Generating synthetic data...")
150
+
151
+ # Parse metadata if provided
152
+ metadata_prompt = parse_metadata_file(metadata_file)
153
+
154
  with tempfile.TemporaryDirectory() as tmpdir:
155
  index_path = os.path.join(tmpdir, "faiss_index.index")
156
  chunks_path = os.path.join(tmpdir, "text_chunks.pkl")
 
164
  index_path=index_path,
165
  text_chunks_path=chunks_path,
166
  max_context_length=8000,
167
+ metadata_context=metadata_prompt
168
  )
169
  result = generator.generate_synthetic_data(k=int(k_chunks))
170
  raw_response = result.get("response", "")
 
304
  label="📄 Upload Research Paper (PDF)",
305
  file_types=[".pdf"]
306
  )
307
+ metadata_file = gr.File(
308
+ label="📋 Upload Data Schema/Metadata (Optional)",
309
+ file_types=[".json", ".txt", ".md"],
310
+ info="Provide expected column types and descriptions to guide data generation"
311
+ )
312
+
313
+ gr.HTML("""
314
+ <div style="background-color: #e8f4fd; padding: 0.8em; border-radius: 6px; border: 1px solid #b3d9ff; margin-top: 0.5em;">
315
+ <h5>📋 Metadata Format Examples:</h5>
316
+ <p><b>JSON format:</b></p>
317
+ <pre style="font-size: 0.8em; background-color: #f8f9fa; padding: 0.5em; border-radius: 4px;">
318
+ {
319
+ "age": {"type": "integer", "description": "Patient age in years"},
320
+ "gender": {"type": "categorical", "description": "Male/Female"},
321
+ "blood_pressure": {"type": "float", "description": "Systolic BP in mmHg"}
322
+ }</pre>
323
+ <p><b>Text format:</b> Simply describe your expected columns and their types.</p>
324
+ </div>
325
+ """)
326
 
327
  with gr.Column(scale=1):
328
  gr.HTML("""
 
332
  <li>Upload healthcare/medical research papers</li>
333
  <li>Ensure PDF contains tables or data descriptions</li>
334
  <li>Clear text (not scanned images) works best</li>
335
+ <li>Upload metadata to specify expected column types</li>
336
  </ul>
337
  </div>
338
  """)
 
469
  # Event handlers
470
  generate_btn.click(
471
  fn=app.process_pdf_and_generate_sample,
472
+ inputs=[pdf_file, metadata_file, llama_key, openrouter_key, model_name, k_chunks],
473
  outputs=[generation_status, sample_data_preview, explanation_output, sample_download],
474
  show_progress=True
475
  ).then(
response_generator.py CHANGED
@@ -25,7 +25,8 @@ class SyntheticDataGenerator:
25
  embedding_model_name: str = "all-MiniLM-L6-v2",
26
  index_path: str = "faiss_index.index",
27
  text_chunks_path: str = "text_chunks.pkl",
28
- max_context_length: int = 8000):
 
29
  """
30
  Initializes the SyntheticDataGenerator.
31
 
@@ -36,6 +37,7 @@ class SyntheticDataGenerator:
36
  index_path: Path to the FAISS index of the paper's text.
37
  text_chunks_path: Path to the pickled text chunks from the paper.
38
  max_context_length: Maximum context length for the LLM prompt.
 
39
  """
40
  # Set up OpenRouter API key
41
  if openai_api_key:
@@ -54,6 +56,7 @@ class SyntheticDataGenerator:
54
  )
55
  self.model_name = model_name
56
  self.max_context_length = max_context_length
 
57
 
58
  # Load embedding model and FAISS index
59
  print("Loading embedding model and FAISS index...")
@@ -125,15 +128,19 @@ The output should be only the synthetic data in CSV format. Give an explanation
125
  what their distribution is.
126
  """
127
 
 
 
 
128
  user_prompt = f"""Based on the following context from a research paper, please generate a synthetic dataset of 100 records.
129
 
130
  Context from the paper:
131
- {context_string}
132
  You have to keep the synthetic data with the similar distribution in all features.
133
  The collinearity between the synthetic data should remain similar to what is mentioned in the paper.
134
  The distribution of the categorical data should be consistent in the data used regarding if it is imbalanced or not.
135
  The distribution of numerical data should be consistent with either uniform distribution or binomial distribution or normal distribution.
136
  This should be applied to each and every feature. If there is a skew in the distribution, you should keep it as it is.
 
137
  Give an explanation on what features have been used in the paper, what their distribution is.
138
  Generate a sample dataset of 100 records of synthetic data for these features.
139
  """
 
25
  embedding_model_name: str = "all-MiniLM-L6-v2",
26
  index_path: str = "faiss_index.index",
27
  text_chunks_path: str = "text_chunks.pkl",
28
+ max_context_length: int = 8000,
29
+ metadata_context: Optional[str] = None):
30
  """
31
  Initializes the SyntheticDataGenerator.
32
 
 
37
  index_path: Path to the FAISS index of the paper's text.
38
  text_chunks_path: Path to the pickled text chunks from the paper.
39
  max_context_length: Maximum context length for the LLM prompt.
40
+ metadata_context: Optional metadata context to guide data generation.
41
  """
42
  # Set up OpenRouter API key
43
  if openai_api_key:
 
56
  )
57
  self.model_name = model_name
58
  self.max_context_length = max_context_length
59
+ self.metadata_context = metadata_context
60
 
61
  # Load embedding model and FAISS index
62
  print("Loading embedding model and FAISS index...")
 
128
  what their distribution is.
129
  """
130
 
131
+ # Build user prompt with optional metadata context
132
+ metadata_section = self.metadata_context if self.metadata_context else ""
133
+
134
  user_prompt = f"""Based on the following context from a research paper, please generate a synthetic dataset of 100 records.
135
 
136
  Context from the paper:
137
+ {context_string}{metadata_section}
138
  You have to keep the synthetic data with the similar distribution in all features.
139
  The collinearity between the synthetic data should remain similar to what is mentioned in the paper.
140
  The distribution of the categorical data should be consistent in the data used regarding if it is imbalanced or not.
141
  The distribution of numerical data should be consistent with either uniform distribution or binomial distribution or normal distribution.
142
  This should be applied to each and every feature. If there is a skew in the distribution, you should keep it as it is.
143
+ {"Please follow the expected data schema provided above when generating the synthetic data." if self.metadata_context else ""}
144
  Give an explanation on what features have been used in the paper, what their distribution is.
145
  Generate a sample dataset of 100 records of synthetic data for these features.
146
  """