xiaoqianran committed on
Commit
d5f197c
·
1 Parent(s): aed8ba9

Add application file

Browse files
Files changed (6) hide show
  1. app copy.py +71 -0
  2. app.py +4 -4
  3. data/shirizhongyan.txt +0 -0
  4. download_hf.py +1 -1
  5. environment_setup.py +31 -31
  6. nltk_setup.py +23 -23
app copy.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# llamaindex_rag.py
"""Build a RAG query engine over local documents with LlamaIndex.

Configures an InternLM-compatible LLM via an OpenAI-like API, downloads a
local sentence-transformers embedding model if needed, indexes the documents
under ``data_dir``, and prints the answer to one sample query.
"""
import os
import subprocess

from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core.settings import Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.legacy.callbacks import CallbackManager
from llama_index.llms.openai_like import OpenAILike

# API key and base URL configuration (prefer environment variables).
# NOTE: the key is intentionally never printed — echoing secrets to stdout
# leaks them into logs.
api_key = os.getenv("INTERNLM_API_KEY") or "YOUR_API_KEY_HERE"  # set INTERNLM_API_KEY instead of editing this
api_base_url = os.environ.get("INTERNLM_BASE_URL") or "https://internlm-chat.intern-ai.org.cn/puyu/api/v1/"
model_name = "internlm2.5-latest"

if api_key == "YOUR_API_KEY_HERE":
    print("警告:请在脚本中或环境变量中配置您的 InternLM API 密钥。")

# Local path where the sentence-transformers embedding model is stored.
model_dir = "/home/user/app/model/sentence-transformer"

# Ensure the model directory exists.
os.makedirs(model_dir, exist_ok=True)

# Download the embedding model (resumable). An argument list (shell=False)
# avoids shell-quoting/injection issues that a formatted os.system string has.
print(f"Downloading sentence-transformers model to {model_dir}...")
subprocess.run(
    [
        "huggingface-cli", "download", "--resume-download",
        "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
        "--local-dir", model_dir,
    ],
    check=False,  # best-effort, matching the original os.system behavior
)
print("Sentence-transformers model download completed.")

# Callback manager shared by the LLM.
callback_manager = CallbackManager()

# Initialize the LLM served through an OpenAI-compatible endpoint.
llm = OpenAILike(
    model=model_name,
    api_base=api_base_url,
    api_key=api_key,
    is_chat_model=True,
    callback_manager=callback_manager,
)

# Initialize the HuggingFace embedding model, reusing the path downloaded above
# instead of duplicating the literal.
embed_model = HuggingFaceEmbedding(model_name=model_dir)
Settings.embed_model = embed_model
Settings.llm = llm

# Data directory to index (every readable file under it is loaded).
data_dir = "/home/user/app/data"
documents = SimpleDirectoryReader(data_dir).load_data()

# Build the vector index and a query engine over it.
index = VectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine()

# Run one sample query and print the response.
query_text = "燕知春和江若雪在什么地方认识,她们参加了什么比赛,创立了什么组织?"
response = query_engine.query(query_text)

print(response)
app.py CHANGED
@@ -1,6 +1,6 @@
1
  # llamaindex_rag.py
2
  import os
3
- print(c)
4
 
5
  from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
6
  from llama_index.core.settings import Settings
@@ -23,7 +23,7 @@ if api_key == "YOUR_API_KEY_HERE":
23
  import os
24
 
25
  # 模型保存路径
26
- model_dir = "/teamspace/studios/this_studio/model/sentence-transformer"
27
 
28
  # 确保模型目录存在
29
  os.makedirs(model_dir, exist_ok=True)
@@ -51,13 +51,13 @@ llm = OpenAILike(
51
 
52
  # 初始化 HuggingFace 嵌入模型
53
  embed_model = HuggingFaceEmbedding(
54
- model_name="/teamspace/studios/this_studio/model/sentence-transformer" # 确保模型已下载到此路径
55
  )
56
  Settings.embed_model = embed_model
57
  Settings.llm = llm
58
 
59
  # 数据文件路径 (请替换为你的数据文件路径)
60
- data_dir = "/teamspace/studios/this_studio/data" # 假设数据文件在此目录下
61
  documents = SimpleDirectoryReader(data_dir).load_data()
62
 
63
  # 构建索引
 
1
  # llamaindex_rag.py
2
  import os
3
+ print(os.getenv('INTERNLM_API_KEY'))
4
 
5
  from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
6
  from llama_index.core.settings import Settings
 
23
  import os
24
 
25
  # 模型保存路径
26
+ model_dir = "/home/user/app/model/sentence-transformer"
27
 
28
  # 确保模型目录存在
29
  os.makedirs(model_dir, exist_ok=True)
 
51
 
52
  # 初始化 HuggingFace 嵌入模型
53
  embed_model = HuggingFaceEmbedding(
54
+ model_name="/home/user/app/model/sentence-transformer" # 确保模型已下载到此路径
55
  )
56
  Settings.embed_model = embed_model
57
  Settings.llm = llm
58
 
59
  # 数据文件路径 (请替换为你的数据文件路径)
60
+ data_dir = "/home/user/app/data" # 假设数据文件在此目录下
61
  documents = SimpleDirectoryReader(data_dir).load_data()
62
 
63
  # 构建索引
data/shirizhongyan.txt ADDED
The diff for this file is too large to render. See raw diff
 
download_hf.py CHANGED
@@ -2,7 +2,7 @@
2
  import os
3
 
4
  # 模型保存路径
5
- model_dir = "/teamspace/studios/this_studio/model/sentence-transformer"
6
 
7
  # 确保模型目录存在
8
  os.makedirs(model_dir, exist_ok=True)
 
2
  import os
3
 
4
  # 模型保存路径
5
+ model_dir = "/home/user/app/model/sentence-transformer"
6
 
7
  # 确保模型目录存在
8
  os.makedirs(model_dir, exist_ok=True)
environment_setup.py CHANGED
@@ -1,34 +1,34 @@
1
- # environment_setup.py
2
- import subprocess
3
- import sys
4
- import os
5
 
6
- def install_packages():
7
- packages = [
8
- "einops==0.7.0",
9
- "protobuf==5.26.1",
10
- "llama-index==0.11.20",
11
- "llama-index-llms-replicate==0.3.0",
12
- "llama-index-llms-openai-like==0.2.0",
13
- "llama-index-embeddings-huggingface==0.3.1",
14
- "llama-index-embeddings-instructor==0.2.1",
15
- "torch==2.5.0",
16
- "torchvision==0.20.0",
17
- "torchaudio==2.5.0"
18
- ]
19
- index_url = "https://download.pytorch.org/whl/cu121" # 根据你的 CUDA 版本调整
20
 
21
- for package in packages:
22
- try:
23
- if package.startswith("torch") or package.startswith("torchvision") or package.startswith("torchaudio"):
24
- subprocess.check_call([sys.executable, "-m", "pip", "install", f"{package}", "--index-url", index_url, "-q"])
25
- else:
26
- subprocess.check_call([sys.executable, "-m", "pip", "install", f"{package}", "-q"])
27
- print(f"Successfully installed {package}")
28
- except subprocess.CalledProcessError as e:
29
- print(f"Error installing {package}: {e}")
30
 
31
- if __name__ == "__main__":
32
- print("Starting to install required packages...")
33
- install_packages()
34
- print("Package installation completed.")
 
1
+ # # environment_setup.py
2
+ # import subprocess
3
+ # import sys
4
+ # import os
5
 
6
+ # def install_packages():
7
+ # packages = [
8
+ # "einops==0.7.0",
9
+ # "protobuf==5.26.1",
10
+ # "llama-index==0.11.20",
11
+ # "llama-index-llms-replicate==0.3.0",
12
+ # "llama-index-llms-openai-like==0.2.0",
13
+ # "llama-index-embeddings-huggingface==0.3.1",
14
+ # "llama-index-embeddings-instructor==0.2.1",
15
+ # "torch==2.5.0",
16
+ # "torchvision==0.20.0",
17
+ # "torchaudio==2.5.0"
18
+ # ]
19
+ # index_url = "https://download.pytorch.org/whl/cu121" # 根据你的 CUDA 版本调整
20
 
21
+ # for package in packages:
22
+ # try:
23
+ # if package.startswith("torch") or package.startswith("torchvision") or package.startswith("torchaudio"):
24
+ # subprocess.check_call([sys.executable, "-m", "pip", "install", f"{package}", "--index-url", index_url, "-q"])
25
+ # else:
26
+ # subprocess.check_call([sys.executable, "-m", "pip", "install", f"{package}", "-q"])
27
+ # print(f"Successfully installed {package}")
28
+ # except subprocess.CalledProcessError as e:
29
+ # print(f"Error installing {package}: {e}")
30
 
31
+ # if __name__ == "__main__":
32
+ # print("Starting to install required packages...")
33
+ # install_packages()
34
+ # print("Package installation completed.")
nltk_setup.py CHANGED
@@ -1,30 +1,30 @@
1
- # nltk_setup.py
2
- import os
3
 
4
- nltk_data_path = "/teamspace/studios/this_studio/nltk_data"
5
 
6
- # 克隆 nltk_data 仓库
7
- if not os.path.exists(nltk_data_path):
8
- print(f"Cloning nltk_data to {nltk_data_path}...")
9
- os.system(f'git clone https://github.com/nltk/nltk_data.git --branch gh-pages {nltk_data_path}')
10
- else:
11
- print(f"nltk_data already exists at {nltk_data_path}, skipping clone.")
12
 
13
- # 移动 packages 和解压必要的数据
14
- packages_src = os.path.join(nltk_data_path, 'packages')
15
- tokenizers_dir = os.path.join(nltk_data_path, 'tokenizers')
16
- taggers_dir = os.path.join(nltk_data_path, 'taggers')
17
 
18
- if os.path.exists(packages_src):
19
- print("Moving packages...")
20
- os.system(f'mv {packages_src}/* {nltk_data_path}/')
21
 
22
- if os.path.exists(os.path.join(tokenizers_dir, 'punkt.zip')):
23
- print("Unzipping punkt tokenizer data...")
24
- os.system(f'unzip {os.path.join(tokenizers_dir, "punkt.zip")} -d {tokenizers_dir}')
25
 
26
- if os.path.exists(os.path.join(taggers_dir, 'averaged_perceptron_tagger.zip')):
27
- print("Unzipping averaged_perceptron_tagger data...")
28
- os.system(f'unzip {os.path.join(taggers_dir, "averaged_perceptron_tagger.zip")} -d {taggers_dir}')
29
 
30
- print("NLTK data setup completed.")
 
1
+ # # nltk_setup.py
2
+ # import os
3
 
4
+ # nltk_data_path = "/home/user/app/nltk_data"
5
 
6
+ # # 克隆 nltk_data 仓库
7
+ # if not os.path.exists(nltk_data_path):
8
+ # print(f"Cloning nltk_data to {nltk_data_path}...")
9
+ # os.system(f'git clone https://github.com/nltk/nltk_data.git --branch gh-pages {nltk_data_path}')
10
+ # else:
11
+ # print(f"nltk_data already exists at {nltk_data_path}, skipping clone.")
12
 
13
+ # # 移动 packages 和解压必要的数据
14
+ # packages_src = os.path.join(nltk_data_path, 'packages')
15
+ # tokenizers_dir = os.path.join(nltk_data_path, 'tokenizers')
16
+ # taggers_dir = os.path.join(nltk_data_path, 'taggers')
17
 
18
+ # if os.path.exists(packages_src):
19
+ # print("Moving packages...")
20
+ # os.system(f'mv {packages_src}/* {nltk_data_path}/')
21
 
22
+ # if os.path.exists(os.path.join(tokenizers_dir, 'punkt.zip')):
23
+ # print("Unzipping punkt tokenizer data...")
24
+ # os.system(f'unzip {os.path.join(tokenizers_dir, "punkt.zip")} -d {tokenizers_dir}')
25
 
26
+ # if os.path.exists(os.path.join(taggers_dir, 'averaged_perceptron_tagger.zip')):
27
+ # print("Unzipping averaged_perceptron_tagger data...")
28
+ # os.system(f'unzip {os.path.join(taggers_dir, "averaged_perceptron_tagger.zip")} -d {taggers_dir}')
29
 
30
+ # print("NLTK data setup completed.")