Mahmous committed on
Commit
de75945
·
verified ·
1 Parent(s): 8a2c597

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +199 -0
app.py ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import traceback
3
+ from flask import Flask, request, jsonify
4
+ from flask_cors import CORS
5
+ from dotenv import load_dotenv
6
+ from openai import OpenAI
7
+ from langdetect import detect
8
+ from deep_translator import GoogleTranslator
9
+ import subprocess
10
+
11
# Patch huggingface_hub automatically if Gradio overwrote it.
# The absence of `cached_download` is used as the signal that an incompatible
# huggingface_hub release is installed (presumably the pinned
# sentence-transformers release still imports it — confirm before changing pins).
try:
    import importlib
    import sys

    import huggingface_hub

    if not hasattr(huggingface_hub, "cached_download"):
        # Run pip through the interpreter executing this file so the packages
        # are installed into the environment this process imports from; a bare
        # `pip` found on PATH may belong to a different Python installation.
        subprocess.run(
            [
                sys.executable, "-m", "pip", "install", "--no-cache-dir",
                "huggingface-hub==0.24.5",
                "transformers==4.30.2",
                "sentence-transformers==2.2.2",
            ],
            check=True,
        )
        # The incompatible package is already cached in sys.modules, so later
        # imports would keep seeing it. Drop the cached entries and refresh
        # the import finders so the freshly installed files are picked up.
        for mod_name in list(sys.modules):
            if mod_name == "huggingface_hub" or mod_name.startswith("huggingface_hub."):
                del sys.modules[mod_name]
        importlib.invalidate_caches()
        print("✅ Downgraded huggingface-hub for sentence-transformers compatibility.")
except Exception as e:
    # Best-effort: the app still tries to start; the later imports will fail
    # loudly if the environment is really unusable.
    print("⚠️ Could not auto-patch huggingface_hub:", e)
22
+ from sentence_transformers import SentenceTransformer
23
+ from pinecone import Pinecone
24
+
25
# ---------- Config ----------
DATASET_PATH = "data/coaching_millionaer_dataset.json"

# Load variables from a local .env file (if present) before reading them below.
load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")  # Add this to your .env
# The index name can be overridden per deployment via the environment;
# an unset or empty value falls back to the original default "ebook".
PINECONE_INDEX_NAME = os.getenv("PINECONE_INDEX_NAME") or "ebook"

# ---------- App ----------
app = Flask(__name__)
# Only the /ask endpoint is opened to cross-origin callers.
CORS(app, resources={r"/ask": {"origins": "*"}})

# ---------- OpenAI Client ----------
# `client` stays None when no API key is configured, so the app can still
# start and report its state.
client = None
if OPENAI_API_KEY:
    client = OpenAI(api_key=OPENAI_API_KEY)
else:
    print("⚠️ OPENAI_API_KEY is missing in .env")
43
+
44
+ # ---------- Retriever ----------
45
class PineconeRetriever:
    """Retriever that embeds a query and searches a Pinecone index with it.

    Defined at module level (rather than inside the initialisation ``try``
    below) so the class exists even when the Pinecone client or the embedding
    model fails to load.
    """

    def __init__(self, index, embedder):
        """Keep handles to the vector index and the embedding model.

        Args:
            index: Object exposing ``query(vector=..., top_k=...,
                include_metadata=...)``.
            embedder: Object exposing ``encode(text)`` whose result has a
                ``tolist()`` method.
        """
        self.index = index
        self.embedder = embedder

    def retrieve(self, query, top_k=10):
        """Return up to *top_k* matches for *query*.

        Args:
            query: Text to embed and search for.
            top_k: Maximum number of matches requested from the index.

        Returns:
            A list of dicts with the keys ``context`` (``""`` when absent),
            ``page`` (``None`` when absent) and ``score`` (``0`` when absent),
            in the order the index returned the matches.
        """
        vector = self.embedder.encode(query).tolist()
        response = self.index.query(vector=vector, top_k=top_k, include_metadata=True)
        hits = []
        for match in response.get("matches", []) or []:
            # Treat a missing or null metadata entry as empty instead of
            # failing on ``None.get``.
            meta = match.get("metadata") or {}
            hits.append({
                "context": meta.get("context", ""),
                "page": meta.get("page"),
                "score": match.get("score", 0),
            })
        return hits


# `retriever` stays None when initialisation fails; the failure is printed
# with a traceback and the rest of the module still loads.
retriever = None
try:
    if not PINECONE_API_KEY:
        raise ValueError("PINECONE_API_KEY missing in .env")

    pc = Pinecone(api_key=PINECONE_API_KEY)
    index = pc.Index(PINECONE_INDEX_NAME)
    embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

    retriever = PineconeRetriever(index, embedder)
    print("✅ Pinecone retriever initialized successfully.")
except Exception as e:
    print("❌ Retriever initialization failed:", e)
    traceback.print_exc()
78
+
79
+ # ---------- Translator ----------
80
def translate_text(text: str, target_lang: str) -> str:
    """Translate text using deep-translator (GoogleTranslator).

    The source language is auto-detected. Translation is best-effort: blank
    input is returned untouched without calling the translator, and any
    failure or empty result falls back to the original text.

    Args:
        text: Text to translate.
        target_lang: Target language code passed to ``GoogleTranslator``.

    Returns:
        The translated text, or *text* unchanged when nothing could be
        translated.
    """
    if not text or not text.strip():
        return text
    try:
        translated = GoogleTranslator(source="auto", target=target_lang).translate(text)
    except Exception as exc:
        # Keep serving the untranslated text, but leave a trace in the logs
        # instead of hiding the failure completely.
        print("⚠️ Translation failed:", exc)
        return text
    # Honour the ``-> str`` contract even if the translator yields nothing.
    return translated or text
86
+
87
+ # ---------- Helpers ----------
88
def detect_language(question: str) -> str:
    """Detect the user's language without translation.

    Args:
        question: The raw user question.

    Returns:
        The language code reported by ``langdetect.detect``, or ``"unknown"``
        when the input is blank or detection fails.
    """
    # Blank input carries no language signal; skip the detector entirely.
    if not question or not question.strip():
        return "unknown"
    try:
        # langdetect's algorithm is randomised unless a seed is fixed, which
        # can give different answers for the same short question.
        from langdetect import DetectorFactory

        DetectorFactory.seed = 0
        return detect(question)
    except Exception:
        return "unknown"
94
+
95
def normalize_language(lang: str, text: str) -> str:
    """Fix incorrect language detection like 'wer is' → German.

    Only a detected ``"nl"`` is ever overridden. It becomes ``"de"`` when the
    text mentions the book-specific markers ``javid`` / ``coaching`` anywhere,
    or contains one of the German question words ``wer`` / ``was`` / ``wie``
    as a whole word.

    Args:
        lang: Language code produced by the detector.
        text: The text the code was detected from.

    Returns:
        ``"de"`` when the override applies, otherwise *lang* unchanged.
    """
    if lang != "nl":
        return lang

    lowered = text.lower()

    # Distinctive markers: matching inside longer words (e.g. "Javids") is fine.
    if any(marker in lowered for marker in ("javid", "coaching")):
        return "de"

    # Short question words must match as whole words; a substring test would
    # also fire inside Dutch words such as "werk" or "Antwerpen".
    words = set("".join(ch if ch.isalnum() else " " for ch in lowered).split())
    if words & {"wer", "was", "wie"}:
        return "de"

    return lang
100
+
101
def system_prompt_book_only() -> str:
    """Build the system prompt that restricts answers to the supplied book context."""
    sentences = (
        "You are CoachingBot, a professional mentor trained on the book 'Coaching Millionär' by Javid Niazi-Hoffmann.",
        "Use only the provided book context to answer the question.",
        "If the user asks about people like Javid Niazi-Hoffmann, describe them factually using the book content.",
        "Mention page numbers where possible.",
        "If the context is not relevant, say you don’t have that information in the book and provide a general, helpful answer.",
        "Always respond in the same language as the user's question, even if the book content is in another language.",
    )
    # Sentences are separated by exactly one space.
    return " ".join(sentences)
110
+
111
def system_prompt_fallback() -> str:
    """Build the system prompt for answering from general coaching knowledge."""
    sentences = (
        "You are CoachingBot, a helpful business and life mentor.",
        "The question cannot be answered from the book, so answer using your general coaching knowledge.",
        "Always respond in the same language as the user's question, even if the book content is in another language.",
        "Do not invent book citations.",
    )
    # Sentences are separated by exactly one space.
    return " ".join(sentences)
118
+
119
def format_answers(question: str, answer: str, results):
    """Wrap an answer in the JSON payload shape returned by the API.

    Args:
        question: The user's question (may be empty for error responses).
        answer: The answer or error text to return.
        results: Iterable of retrieval hits (dicts); ``page`` and ``score``
            are read from each when present.

    Returns:
        ``{"answers": [{"question", "answer", "source", "bm25_score"}]}``
        where ``source`` lists the distinct cited pages as ``"Seite N"``
        (or ``"No source"``) and ``bm25_score`` is the highest hit score
        (``0.0`` when there are no hits).
    """
    labels = []
    for hit in results:
        page = hit.get("page")
        if not page:
            continue
        # Numeric metadata can come back from the vector store as a float;
        # show 12.0 as "Seite 12" rather than "Seite 12.0".
        if isinstance(page, float) and page.is_integer():
            page = int(page)
        labels.append(f"Seite {page}")

    # Several hits often share a page; cite each page once, in first-seen order.
    pages = list(dict.fromkeys(labels))
    source = ", ".join(pages) if pages else "No source"

    # A missing or null score counts as 0.0 so max() never compares None.
    top_score = max((hit.get("score") or 0.0 for hit in results), default=0.0)
    return {"answers": [{"question": question, "answer": answer, "source": source, "bm25_score": top_score}]}
124
+
125
+ # ---------- Routes ----------
126
@app.route("/", methods=["GET"])
def health():
    """Report service status and which dependencies are configured, as JSON."""
    payload = {
        "status": "running",
        "retriever_ready": bool(retriever),
        "openai_key_loaded": bool(OPENAI_API_KEY),
        "pinecone_key_loaded": bool(PINECONE_API_KEY),
        "index_name": PINECONE_INDEX_NAME,
    }
    return jsonify(payload)
135
+
136
@app.route("/ask", methods=["POST", "OPTIONS"])
def ask():
    """Answer a user question, grounded in retrieved book passages when available.

    Expects a JSON body with a ``question`` string. Apart from the empty 204
    reply to an OPTIONS request, every outcome, including errors, is returned
    as the ``format_answers`` payload with HTTP 200; problems are described in
    the ``answer`` field.
    """
    if request.method == "OPTIONS":
        return ("", 204)

    # Any malformed body (not JSON, not an object, non-string question) is
    # reported uniformly as an invalid request.
    try:
        data = request.get_json(force=True) or {}
        question = (data.get("question") or "").strip()
    except Exception:
        return jsonify(format_answers("", "Invalid JSON request", [])), 200

    if not question:
        return jsonify(format_answers("", "Please enter a question.", [])), 200

    print(f"\n--- User Question ---\n{question}")

    # Detect and normalize language. The value is only logged here; the system
    # prompts tell the model to reply in the user's language.
    user_lang = normalize_language(detect_language(question), question)
    print(f"Detected language: {user_lang}")

    # Fail fast with a clear message when a dependency never initialised,
    # instead of surfacing an AttributeError on None to the caller.
    if retriever is None:
        return jsonify(format_answers(question, "Retriever error: retriever is not initialized", [])), 200
    if client is None:
        return jsonify(format_answers(question, "⚠️ OpenAI call failed: OPENAI_API_KEY is not configured", [])), 200

    # Retrieve context
    context, results = "", []
    try:
        raw_results = retriever.retrieve(question)
        # Drop weak matches. The threshold assumes similarity scores roughly
        # in the 0–1 range — TODO confirm against the index's metric.
        MIN_SCORE = 0.10
        results = [r for r in raw_results if (r.get("score") or 0) >= MIN_SCORE]
        if results:
            context = "\n\n---\n\n".join(
                [f"(Seite {r['page']}) {r['context']}" for r in results]
            )
    except Exception as e:
        traceback.print_exc()
        return jsonify(format_answers(question, f"Retriever error: {e}", [])), 200

    # Build prompts: book-grounded when passages were found, otherwise the
    # general-knowledge fallback.
    if context:
        sys_prompt = system_prompt_book_only()
        user_content = f"Question: {question}\n\nBook context:\n{context}"
    else:
        sys_prompt = system_prompt_fallback()
        user_content = question

    # Query GPT
    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": sys_prompt},
                {"role": "user", "content": user_content},
            ],
            max_tokens=700,
        )
        # The message content may be None; treat that as an empty answer
        # rather than failing on ``None.strip()``.
        answer = (response.choices[0].message.content or "").strip()
    except Exception as e:
        traceback.print_exc()
        return jsonify(format_answers(question, f"⚠️ OpenAI call failed: {e}", [])), 200

    return jsonify(format_answers(question, answer, results))
194
+
195
+ # ---------- Run ----------
196
if __name__ == "__main__":
    # Direct execution: listen on all interfaces, on $PORT or 7860 by default.
    listen_port = int(os.environ.get("PORT", 7860))
    print(f"🚀 Server started on port {listen_port}")
    app.run(host="0.0.0.0", port=listen_port)