Adi362 committed on
Commit
ec4e184
·
verified ·
1 Parent(s): 6fdebfe

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +108 -0
app.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from qdrant_client import QdrantClient
3
+ from sentence_transformers import SentenceTransformer
4
+ from transformers import AutoTokenizer, AutoModelForCausalLM
5
+ import torch
6
+ import os
7
+
8
# Qdrant connection settings come from the environment so that credentials
# are never hard-coded in the repository.
QDRANT_URL = os.environ.get("QDRANT_URL")
QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY")
COLLECTION_NAME = "well_vectors"

QWEN_MODEL = "Qwen/Qwen2.5-0.5B-Instruct"

# Sentence embedding model used to embed incoming questions.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

client = QdrantClient(
    url=QDRANT_URL,
    api_key=QDRANT_API_KEY,
)

tokenizer = AutoTokenizer.from_pretrained(QWEN_MODEL)
model = AutoModelForCausalLM.from_pretrained(
    QWEN_MODEL,
    # Half precision only pays off on a GPU; on CPU-only hosts float16 is
    # slow and, on older PyTorch builds, unsupported for some ops.
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
)
27
+
28
def scientific_query_api(question: str):
    """Answer a question using concept and dataset entries stored in Qdrant.

    The question is embedded, the closest point whose payload ``type`` is
    ``"concept"`` is looked up, and that concept's vector is then used to
    fetch up to five neighbouring points as supporting evidence.  The
    collected text is handed to the language model, which is instructed to
    reformat it without adding facts.

    Args:
        question: Free-text question entered by the user.

    Returns:
        A dict with the keys ``"question"``, ``"answer"``, ``"sources"``
        (a sorted list of strings) and ``"confidence"`` (``"grounded"`` when
        a concept was found, otherwise ``"insufficient_data"``).
    """
    # A blank question would still embed to *some* vector and be answered as
    # if it were grounded, so reject it before touching any model.
    if not question or not question.strip():
        return {
            "question": question,
            "answer": "Please enter a scientific question.",
            "sources": [],
            "confidence": "insufficient_data",
        }

    # Local import: qdrant_client is already a dependency of this file, and
    # keeping the import here leaves the file-level import block untouched.
    from qdrant_client import models

    qvec = embedder.encode(question, normalize_embeddings=True).tolist()

    concepts = client.search(
        collection_name=COLLECTION_NAME,
        query_vector=qvec,
        # The keyword is ``query_filter``; ``filter`` is not accepted by
        # ``QdrantClient.search``.
        query_filter=models.Filter(
            must=[
                models.FieldCondition(
                    key="type",
                    match=models.MatchValue(value="concept"),
                )
            ]
        ),
        limit=1,
        # Vectors are omitted from results by default, but the concept's
        # vector is needed for the follow-up evidence search.
        with_vectors=True,
    )

    if not concepts:
        return {
            "question": question,
            "answer": "No indexed scientific material is available for this query.",
            "sources": [],
            "confidence": "insufficient_data",
        }

    concept = concepts[0]

    # Fall back to the question embedding if the server returned no vector.
    # NOTE(review): assumes the collection uses a single unnamed vector;
    # confirm against the collection configuration.
    anchor_vector = concept.vector if concept.vector is not None else qvec

    evidence = client.search(
        collection_name=COLLECTION_NAME,
        query_vector=anchor_vector,
        limit=5,
    )

    packet = [
        "Concept definition:",
        concept.payload["content"],
        "\nScientific context from indexed data:",
    ]

    sources = {"Curated physics concepts"}

    for hit in evidence:
        payload = hit.payload or {}
        if "dataset" in payload:
            packet.append(
                f"- Dataset: {payload['dataset']}, File: {payload.get('file','')}"
            )
            sources.add(f"The Well: {payload['dataset']}")

    evidence_text = "\n".join(packet)

    prompt = f"""
You are a scientific formatter.

Rules:
- Use ONLY the information below.
- Do NOT add facts or interpretations.
- Preserve scientific meaning.

INFORMATION:
{evidence_text}
"""

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=300,
        do_sample=False,
    )

    # For a causal LM, generate() returns the prompt tokens followed by the
    # continuation; drop the prompt so the answer does not echo it.
    prompt_len = inputs["input_ids"].shape[1]
    answer = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)

    return {
        "question": question,
        "answer": answer.strip(),
        "sources": sorted(sources),
        "confidence": "grounded",
    }
100
+
101
# Web UI: a single text box whose content is passed to scientific_query_api;
# the returned dict is rendered as JSON.  Flagging is switched off.
_question_box = gr.Textbox(label="Scientific Question")

iface = gr.Interface(
    fn=scientific_query_api,
    inputs=_question_box,
    outputs="json",
    allow_flagging="never",
)

# Listen on all interfaces at port 7860.
iface.launch(server_name="0.0.0.0", server_port=7860)