webesama
/

MADRS-BERT

@@ -49,6 +49,33 @@ This model is intended for research and development use. It is not a certified m
 ## 🚀 How to Use
 ### Load model and tokenizer:
 ```python
@@ -61,64 +88,27 @@ model = AutoModelForSequenceClassification.from_pretrained(model_name)
 model.eval().to("cuda" if torch.cuda.is_available() else "cpu")
 ```
-### 📝 Predict on a full structured interview:
 Assume you have a conversation log like this:
 ```python
-conversation_log = [
-    {"Speaker": "Interviewer", "Content": "Wie war Ihr Appetit?", "Topic": "Appetit"},
-    {"Speaker": "Patient", "Content": "Ich hatte guten Appetit.", "Topic": "Appetit"},
-    {"Speaker": "Interviewer", "Content": "Wie war Ihr Schlaf?", "Topic": "Schlaf"},
-    {"Speaker": "Patient", "Content": "Ich konnte gut schlafen.", "Topic": "Schlaf"},
-    # etc.
-]
-topics = ["Traurigkeit", "Anspannung", "Schlaf", "Appetit", "Konzentration", "Antriebslosigkeit", "Gefühlslosigkeit", "Gedanken", "Suizid"]
-```
-Use the prediction function:
-```python
-def predict_scores_per_topic(conversation_log, topics, tokenizer, model):
     device = model.device
     predictions = {}
-    for topic in topics:
-        topic_dialogue = "\n".join(
-            [f"{entry['Speaker']}: {entry['Content']}" for entry in conversation_log if entry["Topic"] == topic]
-        )
-        if not topic_dialogue:
-            predictions[topic] = None
-            continue
-        inputs = tokenizer(topic_dialogue, truncation=True, padding="max_length", max_length=512, return_tensors="pt").to(device)
         with torch.no_grad():
             score = torch.round(model(**inputs).logits).clamp(0, 6).item()
         predictions[topic] = score
-    return predictions
-```
----
-## 🧹 Preprocessing Custom Data
-If you want to prepare your own data (e.g., from JSONL with structure: `User ID`, `Speaker`, `Transcription`, `Topic`, `Score`), use the preprocessing below:
-```python
-from datasets import load_dataset
-dataset = load_dataset("json", data_files="your_data.jsonl", split="train")
-def preprocess_function(examples):
-    scores = [int(float(output.split(":")[1].strip())) for output in examples['output']]
-    topics = [
-        input_text.split("\n")[0].replace("Topic: ", "").strip()
-        if "Topic:" in input_text else "Unknown"
-        for input_text in examples['input']
-    ]
-    encoded = tokenizer(examples['input'], truncation=True, padding="max_length", max_length=512)
-    encoded["labels"] = scores
-    encoded["Topic"] = topics
-    return encoded
-tokenized_dataset = dataset.map(preprocess_function, batched=True)
 ```
 ---

 ## 🚀 How to Use
+### Preprocess Data File:
+Please organize your data in the same format as the example data (synthetic data), with the columns: Subject, Speaker, Transcription, Topic, Score.
+```python
+import pandas as pd
+def load_and_prepare_conversations(filepath):
+    df = pd.read_excel(filepath)
+    conversations = []
+    for topic in df['Topic'].unique():
+        topic_df = df[df['Topic'] == topic]
+        if topic_df.empty: continue
+        dialogue = "\n".join([
+            f"{row['Speaker']}: {row['Transcription']}"
+            for _, row in topic_df.iterrows()
+            if pd.notnull(row['Transcription'])
+        ])
+        conversations.append((topic, dialogue))
+    return conversations
+```
 ### Load model and tokenizer:
 ```python
 model.eval().to("cuda" if torch.cuda.is_available() else "cpu")
 ```
+### 📝 Predict on a full structured interview / Run inference:
 Define the prediction function and run it on the conversations loaded from your data file:
 ```python
+def predict_madrs_scores(conversations, tokenizer, model):
     device = model.device
     predictions = {}
+    for topic, dialogue in conversations:
+        inputs = tokenizer(dialogue, truncation=True, padding="max_length", max_length=512, return_tensors="pt").to(device)
         with torch.no_grad():
             score = torch.round(model(**inputs).logits).clamp(0, 6).item()
         predictions[topic] = score
+    return predictions
+file_path = "example_interview.xlsx"
+conversations = load_and_prepare_conversations(file_path)
+scores = predict_madrs_scores(conversations, tokenizer, model)
+print(scores)
 ```
 ---