MojoHz committed on
Commit
8ad2c40
·
verified ·
1 Parent(s): d4e7435

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +165 -0
app.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# NOTE(review): the original began with notebook-style "!pip install ..." lines,
# which are a SyntaxError in a plain app.py. Declare these dependencies in
# requirements.txt instead (as expected by Hugging Face Spaces):
#   langchain-community, yt-dlp, langchain, sentence-transformers, faiss-gpu,
#   pypdf, transformers, youtube-search-python, arxiv, requests, scikit-learn

# Import libraries
import os
import requests
import re
from yt_dlp import YoutubeDL
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login
import arxiv
import numpy as np

# Access the Hugging Face token from the environment variable.
# NOTE(review): env var name is "HF_Token" (mixed case) — confirm it matches
# the Space secret's exact name.
HF_TOKEN = os.getenv("HF_Token")
# Guard: login(token=None) would fall back to an interactive prompt, which
# hangs in a headless environment.
if HF_TOKEN:
    login(token=HF_TOKEN)

# Initialize the sentence-embedding model used for similarity ranking.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Pre-seeded local content and directories for downloaded videos/papers.
file_paths = {
    "video": "./Machine Learning.mp4",  # Replace with actual paths
    "paper": "./L35501081219.pdf",
}
download_path = "./downloads"
papers_path = "./papers"
os.makedirs(download_path, exist_ok=True)
os.makedirs(papers_path, exist_ok=True)

# Load LLaMA 2 (gated model: requires an HF token authorized for Llama 2).
model_name = "meta-llama/Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=True)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype="auto")
41
+
42
+ # Define utility functions
43
def compute_similarity(query_embedding, content_embeddings):
    """Return cosine similarity between the query vector and each content vector.

    Args:
        query_embedding: 1-D array-like query embedding.
        content_embeddings: 2-D array-like of content embeddings (rows).

    Returns:
        1-D numpy array of cosine similarities, one per content row.

    Implemented directly with NumPy (already imported) instead of pulling in
    sklearn for a one-liner; zero-norm vectors are given norm 1 so their
    similarity is 0 rather than NaN, matching sklearn's cosine_similarity.
    """
    q = np.asarray(query_embedding, dtype=float)
    m = np.asarray(content_embeddings, dtype=float)
    q_norm = np.linalg.norm(q) or 1.0
    m_norms = np.linalg.norm(m, axis=1)
    m_norms[m_norms == 0] = 1.0
    return (m @ q) / (m_norms * q_norm)
47
+
48
def add_local_files(module, paths=None):
    """Return metadata entries for the pre-seeded local file of *module*.

    Args:
        module: Content category, e.g. "video" or "paper".
        paths: Optional mapping of module -> file path. Defaults to the
            module-level ``file_paths`` registry (backward compatible).

    Returns:
        A one-element list of metadata dicts ({title, url, file_path, type}),
        or [] when the module has no registered local file.
    """
    if paths is None:
        paths = file_paths  # fall back to the global registry
    if module not in paths:
        return []
    file_path = paths[module]
    # The original duplicated identical "video"/"paper" branches and fell
    # through to an implicit None for any other key; use the module name as
    # the type label and always return a list.
    return [{"title": os.path.basename(file_path), "url": None,
             "file_path": file_path, "type": module}]
57
+
58
def download_youtube_video(video_url, output_dir, title=None):
    """Download a YouTube video with yt_dlp and return its local file path.

    Args:
        video_url: Full URL of the video to download.
        output_dir: Directory the file is written into.
        title: Optional filename stem (sanitized of filesystem-unsafe chars);
            falls back to the video's own title via the output template.

    Returns:
        The path of the downloaded file, or None on failure.
    """
    sanitized_title = re.sub(r'[\\/*?:"<>|]', '_', title) if title else None
    outtmpl = (f"{output_dir}/{sanitized_title}.%(ext)s"
               if sanitized_title else f"{output_dir}/%(title)s.%(ext)s")
    ydl_opts = {
        'quiet': True,
        'outtmpl': outtmpl,
        'format': 'best',
    }
    try:
        with YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(video_url, download=True)
            # BUG FIX: the original hard-coded a ".mp4" extension (and returned
            # "None.mp4" when title was None) even though the template uses
            # %(ext)s; ask yt_dlp for the path it actually wrote.
            return ydl.prepare_filename(info)
    except Exception as e:
        print(f"Failed to download video {video_url}. Error: {e}")
        return None
73
+
74
def fetch_and_download_youtube_video(query, output_dir="./downloads"):
    """Search YouTube for *query*, download the top hit, and return its metadata.

    Args:
        query: Free-text search query (routed through yt_dlp's ytsearch).
        output_dir: Directory the video file is downloaded into.

    Returns:
        A one-element list of metadata dicts ({title, url, file_path, type}),
        or [] when the search fails or returns no results.
    """
    ydl_opts = {
        'quiet': True,
        'noplaylist': True,
        'default_search': 'ytsearch',
        'max_downloads': 1,
        'skip_download': True,
    }
    try:
        with YoutubeDL(ydl_opts) as ydl:
            search_results = ydl.extract_info(query, download=False)
        # ROBUSTNESS: a query can legitimately match nothing; the original
        # raised (and then swallowed) an IndexError on entries[0].
        entries = (search_results or {}).get('entries') or []
        if not entries:
            print(f"No YouTube results for query '{query}'")
            return []
        video = entries[0]  # best (first) search result
        video_title = video['title']
        video_url = video['webpage_url']
        local_path = download_youtube_video(video_url, output_dir, title=video_title)
        return [{"title": video_title, "url": video_url, "file_path": local_path, "type": "video"}]
    except Exception as e:
        print(f"Error fetching YouTube video for query '{query}': {e}")
        return []
94
+
95
def fetch_from_arxiv(query="machine learning", max_results=2, output_dir="./papers"):
    """Fetch papers matching *query* from arXiv and download their PDFs.

    Args:
        query: Search string sent to the arXiv API.
        max_results: Maximum number of papers to retrieve.
        output_dir: Existing directory PDFs are written into.

    Returns:
        A list of metadata dicts ({title, url, file_path, type}) for the
        papers whose PDFs downloaded successfully (best-effort: failures are
        logged and skipped).
    """
    search = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.Relevance
    )
    metadata = []
    # NOTE(review): Search.results() is deprecated in arxiv>=2.0 in favor of
    # arxiv.Client().results(search); kept here for 1.x compatibility.
    for i, result in enumerate(search.results()):
        pdf_url = result.pdf_url
        filename = f"{query.replace(' ', '_')}_arxiv_{i}.pdf"
        local_path = os.path.join(output_dir, filename)
        try:
            # ROBUSTNESS: the original had no timeout (could hang forever)
            # and silently skipped non-200 responses; raise so the except
            # branch logs the failure instead.
            response = requests.get(pdf_url, timeout=30)
            response.raise_for_status()
            with open(local_path, 'wb') as f:
                f.write(response.content)
            metadata.append({"title": result.title, "url": pdf_url, "file_path": local_path, "type": "paper"})
        except Exception as e:
            print(f"Error downloading paper: {e}")
    return metadata
116
+
117
def generate_llama_response(query, context=None):
    """Generate an answer to *query* with LLaMA 2, optionally grounded in *context*.

    Args:
        query: User question inserted into the prompt.
        context: Optional retrieval context string prepended before "Answer:".

    Returns:
        The decoded generation (includes the prompt text, as in the original).
    """
    input_text = f"Query: {query}\n"
    if context:
        input_text += f"Context: {context}\n"
    input_text += "Answer:"
    # BUG FIX: the original hard-coded .to("cuda"), crashing on CPU-only hosts
    # even though the model is loaded with device_map="auto"; follow the
    # model's actual device instead.
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
    # BUG FIX: temperature is ignored (with a warning) unless sampling is
    # enabled; pass do_sample=True so temperature=0.7 takes effect.
    outputs = model.generate(inputs["input_ids"], max_length=500,
                             temperature=0.7, do_sample=True)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response
127
+
128
def hybrid_rag_system_with_llama(query):
    """Retrieve the best video and paper for *query*, then answer with LLaMA 2.

    For each module ("video", "paper") this gathers local files plus freshly
    fetched candidates, ranks them by embedding cosine similarity between the
    query and each "title (type)" description, and feeds the best titles to
    the LLM as context.

    Args:
        query: Free-text user query.

    Returns:
        (final_results, final_response): per-module dicts with ``best_match``,
        ``similarity`` and ``all_metadata``, plus the generated answer string.
    """
    modules = ["video", "paper"]
    final_results = {}
    query_embedding = embedding_model.encode(query)

    for module in modules:
        metadata = []
        metadata.extend(add_local_files(module))
        if module == "video":
            metadata.extend(fetch_and_download_youtube_video(query, output_dir=download_path))
        elif module == "paper":
            metadata.extend(fetch_from_arxiv(query, max_results=2, output_dir=papers_path))
        if metadata:
            descriptions = [f"{item['title']} ({item['type']})" for item in metadata]
            # PERF: encode all descriptions in one batched call instead of one
            # model invocation per description.
            description_embeddings = embedding_model.encode(descriptions)
            similarities = compute_similarity(query_embedding, description_embeddings)
            for idx, item in enumerate(metadata):
                item["similarity"] = similarities[idx]
            best_match_idx = np.argmax(similarities)
            final_results[module] = {
                "best_match": metadata[best_match_idx],
                "similarity": similarities[best_match_idx],
                "all_metadata": metadata,
            }
        else:
            final_results[module] = {"best_match": None, "similarity": None, "all_metadata": []}

    # Build a compact context naming the winning items (or noting their absence).
    video_context = (f"Best Video: {final_results['video']['best_match']['title']}"
                     if final_results['video']['best_match'] else "No relevant video found.")
    paper_context = (f"Best Paper: {final_results['paper']['best_match']['title']}"
                     if final_results['paper']['best_match'] else "No relevant paper found.")
    context = f"{video_context}\n{paper_context}"
    final_response = generate_llama_response(query, context)
    return final_results, final_response
160
+
161
# Example query — guarded so importing this module does not trigger downloads
# and LLM generation as an import-time side effect.
if __name__ == "__main__":
    query = "short easy machine learning"
    results, final_response = hybrid_rag_system_with_llama(query)
    print("\nFinal Response Generated by LLaMA 2:")
    print(final_response)