Shirjannn committed on
Commit
1cbb046
·
verified ·
1 Parent(s): cdaa3b3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +87 -55
app.py CHANGED
@@ -1,85 +1,117 @@
1
  # ===============================
2
- # Derma Space: Dataset Prep & HF Upload
3
  # ===============================
4
 
5
  import json
6
  import random
7
  import os
 
8
  from datasets import load_dataset
9
  from huggingface_hub import HfApi, login, upload_file
10
 
11
  # ------------------------
12
  # 1️⃣ ورود با Secret
13
  # ------------------------
14
- hf_token = os.environ["HF_TOKEN"] # توکن باید در Secrets Space اضافه شده باشد
15
  login(token=hf_token)
16
 
17
  # ------------------------
18
- # 2️⃣ بارگذاری دیتاست عمومی
19
  # ------------------------
20
- print("Loading DailyDialog...")
21
- dd = load_dataset("daily_dialog")['train']
22
- dd_examples = [{"domain":"general","context":conv[i],"response":conv[i+1]}
23
- for conv in dd['dialog'] for i in range(len(conv)-1)]
 
24
 
25
- print("Loading Persona-Chat...")
26
- pc = load_dataset("persona_chat", "self_original")['train']
27
- pc_examples = [{"domain":"general","context":conv['utterances'][i]['text'],
28
- "response":conv['utterances'][i+1]['text']}
29
- for conv in pc for i in range(len(conv['utterances'])-1)]
30
 
31
- general_examples = dd_examples + pc_examples
32
- random.shuffle(general_examples)
33
- general_examples = general_examples[:5000] # نمونه‌گیری
34
 
35
- # ------------------------
36
- # 3️⃣ دیتاست تخصصی درماتولوژی
37
- # ------------------------
38
- print("Loading Dermatology QA (Mreeb)...")
39
- derma = load_dataset("Mreeb/Dermatology-Question-Answer-Dataset-For-Fine-Tuning")['train']
40
- derma_examples = [{"domain":"dermatology","context":item['question'],"response":item['answer']}
41
- for item in derma]
42
 
43
- print("Loading MedQuAD...")
44
- medquad = load_dataset("pythonafroz/MedQuAD")['train']
45
- derma_keywords = ["skin", "eczema", "psoriasis", "dermatitis", "melanoma", "acne", "rash"]
46
- medquad_derma = [{"domain":"dermatology","context":item['question'],"response":item['answer']}
47
- for item in medquad if any(k in item['question'].lower() for k in derma_keywords)]
48
- random.shuffle(medquad_derma)
49
- medquad_derma = medquad_derma[:500]
50
 
51
- dermatology_examples = derma_examples + medquad_derma
52
- random.shuffle(dermatology_examples)
53
 
54
- # ------------------------
55
- # 4️⃣ ترکیب نهایی
56
- # ------------------------
57
- all_examples = general_examples + dermatology_examples
58
- random.shuffle(all_examples)
59
 
60
- # ------------------------
61
- # 5️⃣ ذخیره به JSONL
62
- # ------------------------
63
- output_file = "derma_chat_mix.jsonl"
64
- with open(output_file, 'w', encoding='utf-8') as f:
65
- for ex in all_examples:
66
- f.write(json.dumps(ex, ensure_ascii=False) + "\n")
67
 
68
- print(f"✅ Dataset saved locally as {output_file} ({len(all_examples)} examples)")
 
 
 
 
 
 
 
 
 
 
 
69
 
70
  # ------------------------
71
- # 6️⃣ آپلود به Hugging Face Hub
72
  # ------------------------
73
- repo_id = "username/Derma" # نام کاربری خودت + نام Space
74
- api = HfApi()
75
- api.create_repo(repo_id=repo_id, repo_type="dataset", exist_ok=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
 
77
- upload_file(
78
- path_or_fileobj=output_file,
79
- path_in_repo=output_file,
80
- repo_id=repo_id,
81
- repo_type="dataset",
82
- commit_message="Initial upload of text-based chat dataset"
 
 
 
83
  )
84
 
85
- print(f"✅ Dataset uploaded to Hugging Face Hub: https://huggingface.co/datasets/{repo_id}")
 
 
 
 
 
 
 
 
 
1
  # ===============================
2
+ # Derma Space: Dataset + Gradio Chatbot
3
  # ===============================
4
 
5
  import json
6
  import random
7
  import os
8
+ import gradio as gr
9
  from datasets import load_dataset
10
  from huggingface_hub import HfApi, login, upload_file
11
 
12
  # ------------------------
13
  # 1️⃣ ورود با Secret
14
  # ------------------------
15
# Log in to the Hugging Face Hub with the token provided through the
# HF_TOKEN environment variable (expected to be configured as a Space secret).
try:
    hf_token = os.environ["HF_TOKEN"]
except KeyError:
    # Same exception type as before, but with a message that says what to do.
    raise KeyError(
        "HF_TOKEN is not set; add it to the Space secrets before starting the app"
    ) from None
login(token=hf_token)
17
 
18
  # ------------------------
19
+ # 2️⃣ ساخت دیتاست ترکیبی
20
  # ------------------------
21
def build_dataset(output_file="derma_chat_mix.jsonl", repo_id="username/Derma"):
    """Build the mixed chat dataset, save it as JSONL, and upload it to the Hub.

    Every example is a dict with the keys ``domain``, ``context`` and
    ``response``:

    * ``general`` examples pair each turn with the following turn in
      DailyDialog and Persona-Chat conversations; after shuffling, at most
      5000 of them are kept.
    * ``dermatology`` examples are every question/answer pair of the Mreeb
      dermatology QA set plus at most 500 MedQuAD pairs whose question
      contains one of a small list of skin-related keywords.

    The shuffles use the global ``random`` state, so the output is only
    reproducible if the caller seeds ``random`` beforehand.

    Args:
        output_file: Path of the JSONL file to write (one JSON object per
            line). Its base name is also used as the file name in the repo.
        repo_id: Target dataset repository, ``"<username>/<name>"``. The
            default is a placeholder and must be replaced with a real repo
            the logged-in token can write to.
    """
    print("Loading DailyDialog...")
    dd = load_dataset("daily_dialog")['train']
    # Pair each utterance with the one that follows it.
    dd_examples = [
        {"domain": "general", "context": context, "response": response}
        for conv in dd['dialog']
        for context, response in zip(conv, conv[1:])
    ]

    print("Loading Persona-Chat...")
    pc = load_dataset("persona_chat", "self_original")['train']
    pc_examples = [
        {"domain": "general", "context": prev['text'], "response": nxt['text']}
        for conv in pc
        for prev, nxt in zip(conv['utterances'], conv['utterances'][1:])
    ]

    general_examples = dd_examples + pc_examples
    random.shuffle(general_examples)
    general_examples = general_examples[:5000]  # random sample of the general pool

    print("Loading Dermatology QA (Mreeb)...")
    derma = load_dataset("Mreeb/Dermatology-Question-Answer-Dataset-For-Fine-Tuning")['train']
    derma_examples = [
        {"domain": "dermatology", "context": item['question'], "response": item['answer']}
        for item in derma
    ]

    print("Loading MedQuAD...")
    medquad = load_dataset("pythonafroz/MedQuAD")['train']
    derma_keywords = ["skin", "eczema", "psoriasis", "dermatitis", "melanoma", "acne", "rash"]
    # Keep only MedQuAD questions that mention a dermatology keyword.
    medquad_derma = [
        {"domain": "dermatology", "context": item['question'], "response": item['answer']}
        for item in medquad
        if any(k in item['question'].lower() for k in derma_keywords)
    ]
    random.shuffle(medquad_derma)
    medquad_derma = medquad_derma[:500]

    dermatology_examples = derma_examples + medquad_derma
    random.shuffle(dermatology_examples)

    all_examples = general_examples + dermatology_examples
    random.shuffle(all_examples)

    # Save as JSONL; ensure_ascii=False keeps non-ASCII text readable.
    with open(output_file, 'w', encoding='utf-8') as f:
        for ex in all_examples:
            f.write(json.dumps(ex, ensure_ascii=False) + "\n")
    print(f"✅ Dataset saved locally as {output_file} ({len(all_examples)} examples)")

    # Upload to the Hugging Face Hub (the repo is created if it does not exist).
    api = HfApi()
    api.create_repo(repo_id=repo_id, repo_type="dataset", exist_ok=True)
    upload_file(
        path_or_fileobj=output_file,
        path_in_repo=os.path.basename(output_file),
        repo_id=repo_id,
        repo_type="dataset",
        commit_message="Initial upload of text-based chat dataset"
    )
    print(f"✅ Dataset uploaded: https://huggingface.co/datasets/{repo_id}")
75
 
76
  # ------------------------
77
+ # 3️⃣ چت ساده با Gradio
78
  # ------------------------
79
_CHAT_DATA_PATH = "derma_chat_mix.jsonl"
# path -> (file mtime, [(context token set, response), ...])
_chat_cache = {}


def _load_chat_examples(path=_CHAT_DATA_PATH):
    """Return ``(context_tokens, response)`` pairs parsed from the JSONL file.

    The parsed list is cached per path and rebuilt only when the file's
    modification time changes, so the file is not re-read for every message.
    Blank lines are skipped. Raises ``OSError`` if the file cannot be read.
    """
    mtime = os.path.getmtime(path)
    cached = _chat_cache.get(path)
    if cached is not None and cached[0] == mtime:
        return cached[1]

    examples = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank / trailing empty lines
            item = json.loads(line)
            context_tokens = frozenset(item['context'].lower().split())
            examples.append((context_tokens, item['response']))
    _chat_cache[path] = (mtime, examples)
    return examples


def simple_chat(user_input):
    """Answer ``user_input`` with the response of the best-matching example.

    Matching is a simple bag-of-words overlap: the input and every stored
    ``context`` are lower-cased and split on whitespace, and the example
    sharing the most distinct words wins (the earliest one on ties).

    Args:
        user_input: The user's message; ``None`` is treated as empty.

    Returns:
        The matched example's response, or a fixed apology string when no
        example shares a word with the input (or the match has no response).

    Raises:
        OSError: If ``derma_chat_mix.jsonl`` does not exist or is unreadable.
    """
    # Tokenise the query once instead of once per dataset row.
    query_tokens = set((user_input or "").lower().split())

    best_match = None
    max_overlap = 0
    for context_tokens, response in _load_chat_examples():
        overlap = len(query_tokens & context_tokens)
        if overlap > max_overlap:  # strict '>' keeps the first of equal matches
            max_overlap = overlap
            best_match = response

    if best_match:
        return best_match
    return "Sorry, I don't have a good answer for that. Try another question!"
97
 
98
+ # ------------------------
99
+ # 4️⃣ راه‌اندازی Gradio
100
+ # ------------------------
101
# Gradio UI: a two-line question box wired to simple_chat, with the reply
# shown in a labelled text box.
_question_box = gr.Textbox(
    lines=2,
    placeholder="Ask about dermatology or chat casually...",
)
_answer_box = gr.Textbox(label="Derma ChatBot")

iface = gr.Interface(
    fn=simple_chat,
    inputs=_question_box,
    outputs=_answer_box,
    title="Derma ChatBot",
    description="A simple English chatbot combining general conversation + dermatology QA.",
)
108
 
109
+ # ------------------------
110
+ # 5️⃣ اجرای دیتاست + رابط
111
+ # ------------------------
112
if __name__ == "__main__":
    # Build the dataset only once: later starts reuse the saved JSONL file.
    dataset_missing = not os.path.exists("derma_chat_mix.jsonl")
    if dataset_missing:
        build_dataset()

    # Start the Gradio interface.
    iface.launch()