|
|
|
|
|
|
|
|
|
|
|
|
|
|
import json |
|
|
import random |
|
|
import os |
|
|
import gradio as gr |
|
|
from datasets import load_dataset |
|
|
from huggingface_hub import HfApi, login, upload_file |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Authenticate with the Hugging Face Hub at import time. The access token is
# read from the HF_TOKEN environment variable; a missing variable aborts
# start-up with an explicit error instead of failing later during upload.
hf_token = os.getenv("HF_TOKEN")
if hf_token is None:
    raise ValueError("HF_TOKEN not found in Secrets. Please add it in Space settings.")

login(token=hf_token)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def build_dataset(repo_id="username/Derma", output_file="derma_chat_mix.jsonl"):
    """Build the mixed chat dataset, save it as JSONL and upload it to the Hub.

    Two hand-written general-conversation examples are combined with the
    question/answer pairs from the ``train`` split of
    ``Mreeb/Dermatology-Question-Answer-Dataset-For-Fine-Tuning``. Rows whose
    ``prompt`` or ``response`` is missing or empty are skipped. The combined
    list is shuffled in place, written one JSON object per line to
    ``output_file``, and that file is uploaded to the dataset repository
    ``repo_id`` (created first if it does not exist yet).

    Args:
        repo_id: Target dataset repository on the Hugging Face Hub. The
            default is a placeholder namespace; pass your own
            ``"<user-or-org>/<name>"``.
        output_file: Local path of the JSONL file; the same value is used as
            the path inside the repository. ``simple_chat`` reads the default
            path, so only change it if the reader is changed as well.
    """
    print("Creating a small general dataset...")
    general_examples = [
        {"domain": "general", "context": "Hello, how are you?", "response": "I'm good, thank you!"},
        {"domain": "general", "context": "What's your name?", "response": "I'm Derma ChatBot."},
    ]

    print("Loading Dermatology QA (Mreeb)...")
    derma = load_dataset("Mreeb/Dermatology-Question-Answer-Dataset-For-Fine-Tuning")['train']
    # Printed so a column rename upstream is visible in the logs; the
    # extraction below assumes the columns are named 'prompt' and 'response'.
    print("Columns in Mreeb dataset:", derma.column_names)

    # Keep only rows where both the question and the answer are non-empty.
    derma_examples = [
        {"domain": "dermatology", "context": item.get('prompt'), "response": item.get('response')}
        for item in derma
        if item.get('prompt') and item.get('response')
    ]

    all_examples = general_examples + derma_examples
    random.shuffle(all_examples)

    # One JSON object per line; ensure_ascii=False keeps non-ASCII text readable.
    with open(output_file, 'w', encoding='utf-8') as f:
        for ex in all_examples:
            f.write(json.dumps(ex, ensure_ascii=False) + "\n")
    print(f"✅ Dataset saved locally as {output_file} ({len(all_examples)} examples)")

    api = HfApi()
    # exist_ok=True makes repeated runs against an existing repo a no-op here.
    api.create_repo(repo_id=repo_id, repo_type="dataset", exist_ok=True)
    upload_file(
        path_or_fileobj=output_file,
        path_in_repo=output_file,
        repo_id=repo_id,
        repo_type="dataset",
        commit_message="Initial upload of text-based chat dataset",
    )
    print(f"✅ Dataset uploaded: https://huggingface.co/datasets/{repo_id}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def simple_chat(user_input):
    """Return the stored response whose context best matches ``user_input``.

    Matching is a plain word-overlap score: the input and each example's
    ``context`` are lower-cased and split on whitespace, and the example
    sharing the most distinct words with the input wins. On a tie the example
    that appears first in ``derma_chat_mix.jsonl`` is kept.

    Args:
        user_input: The user's message. ``None``, empty and whitespace-only
            input yield the fallback message without reading the dataset.

    Returns:
        The ``response`` of the best-matching example, or a fixed apology
        string when no example shares a word with the input.

    Raises:
        FileNotFoundError: If ``derma_chat_mix.jsonl`` does not exist and the
            input contains at least one word.
    """
    fallback = "Sorry, I don't have a good answer for that. Try another question!"

    # Tokenise the query once instead of once per dataset row.
    query_words = set((user_input or "").lower().split())
    if not query_words:
        return fallback

    best_match = None
    max_overlap = 0
    # Stream the file line by line; there is no need to hold every example
    # in memory just to find the maximum.
    with open("derma_chat_mix.jsonl", 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline added by hand).
                continue
            item = json.loads(line)
            overlap = len(query_words & set(item['context'].lower().split()))
            # Strict '>' keeps the earliest example among equal scores.
            if overlap > max_overlap:
                max_overlap = overlap
                best_match = item['response']

    return best_match if best_match else fallback
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Gradio UI: a two-line text input wired to simple_chat, with the answer
# shown in a labelled text box.
_question_box = gr.Textbox(
    lines=2,
    placeholder="Ask about dermatology or chat casually...",
)
_answer_box = gr.Textbox(label="Derma ChatBot")

iface = gr.Interface(
    fn=simple_chat,
    inputs=_question_box,
    outputs=_answer_box,
    title="Derma ChatBot",
    description="A simple English chatbot combining general conversation + dermatology QA.",
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Build (and upload) the dataset only when the local JSONL file is not
    # already present, then start the web UI.
    dataset_present = os.path.exists("derma_chat_mix.jsonl")
    if not dataset_present:
        build_dataset()

    iface.launch()
|
|
|