Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -22,6 +22,8 @@ def load_model():
|
|
| 22 |
|
| 23 |
def generate_embedding(text, tokenizer, model, device):
|
| 24 |
"""Generate embeddings for a given text."""
|
|
|
|
|
|
|
| 25 |
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
|
| 26 |
inputs = {k: v.to(device) for k, v in inputs.items()}
|
| 27 |
with torch.no_grad():
|
|
@@ -30,14 +32,18 @@ def generate_embedding(text, tokenizer, model, device):
|
|
| 30 |
|
| 31 |
# Load dataset
|
| 32 |
@st.cache_data
|
| 33 |
-
def load_data(
|
| 34 |
dataset = load_dataset("frankjosh/filtered_dataset", split="train")
|
| 35 |
df = pd.DataFrame(dataset).head(500) # Limit to 500 repositories
|
| 36 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
# Generate embeddings for each row
|
| 38 |
def compute_embedding(row):
|
| 39 |
-
text = f"{row['docstring']} {row['summary']}"
|
| 40 |
-
return generate_embedding(text,
|
| 41 |
|
| 42 |
df['embedding'] = df.apply(compute_embedding, axis=1)
|
| 43 |
return df
|
|
@@ -46,12 +52,12 @@ def fetch_readme(repo_url):
|
|
| 46 |
"""Fetch README file from GitHub repository."""
|
| 47 |
try:
|
| 48 |
readme_url = repo_url.rstrip("/") + "/blob/main/README.md"
|
| 49 |
-
response = requests.get(readme_url)
|
| 50 |
if response.status_code == 200:
|
| 51 |
return response.text
|
| 52 |
else:
|
| 53 |
return "README not available."
|
| 54 |
-
except
|
| 55 |
return f"Error fetching README: {e}"
|
| 56 |
|
| 57 |
# Main application logic
|
|
@@ -61,38 +67,49 @@ def main():
|
|
| 61 |
|
| 62 |
# Load resources
|
| 63 |
tokenizer, model, device = load_model()
|
| 64 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
|
| 66 |
# Input user query
|
| 67 |
user_query = st.text_input("Describe your project or learning goal:",
|
| 68 |
"I am working on a project to recommend music using pandas and numpy.")
|
| 69 |
if user_query:
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
|
| 97 |
if __name__ == "__main__":
|
| 98 |
main()
|
|
|
|
| 22 |
|
| 23 |
def generate_embedding(text, tokenizer, model, device):
|
| 24 |
"""Generate embeddings for a given text."""
|
| 25 |
+
if not text.strip():
|
| 26 |
+
return np.zeros(512) # Handle empty input gracefully
|
| 27 |
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
|
| 28 |
inputs = {k: v.to(device) for k, v in inputs.items()}
|
| 29 |
with torch.no_grad():
|
|
|
|
| 32 |
|
| 33 |
# Load dataset
|
| 34 |
@st.cache_data
def load_data(_tokenizer, _model, _device):
    """Load the filtered repo dataset and precompute one embedding per row.

    Parameters are underscore-prefixed so Streamlit's cache_data does not
    attempt to hash the unhashable tokenizer/model/device objects.

    Returns:
        pandas.DataFrame: first 500 rows of "frankjosh/filtered_dataset"
        with 'docstring'/'summary' NaNs filled and an added 'embedding'
        column produced by generate_embedding().
    """
    dataset = load_dataset("frankjosh/filtered_dataset", split="train")
    df = pd.DataFrame(dataset).head(500)  # Limit to 500 repositories

    # Fill missing values to avoid errors. NOTE: the previous
    # df.get(col, "").fillna("") pattern breaks when a column is absent —
    # DataFrame.get returns the default (a plain str), and str has no
    # .fillna(). Guard on column presence explicitly instead.
    for col in ("docstring", "summary"):
        df[col] = df[col].fillna("") if col in df.columns else ""

    # Generate an embedding per row from the concatenated text fields.
    def compute_embedding(row):
        text = f"{row['docstring']} {row['summary']}"
        return generate_embedding(text, _tokenizer, _model, _device)

    df['embedding'] = df.apply(compute_embedding, axis=1)
    return df
|
|
|
|
| 52 |
"""Fetch README file from GitHub repository."""
|
| 53 |
try:
|
| 54 |
readme_url = repo_url.rstrip("/") + "/blob/main/README.md"
|
| 55 |
+
response = requests.get(readme_url, timeout=10)
|
| 56 |
if response.status_code == 200:
|
| 57 |
return response.text
|
| 58 |
else:
|
| 59 |
return "README not available."
|
| 60 |
+
except requests.exceptions.RequestException as e:
|
| 61 |
return f"Error fetching README: {e}"
|
| 62 |
|
| 63 |
# Main application logic
|
|
|
|
| 67 |
|
| 68 |
# Load resources
|
| 69 |
tokenizer, model, device = load_model()
|
| 70 |
+
|
| 71 |
+
with st.spinner("Loading dataset and generating embeddings. This may take a moment..."):
|
| 72 |
+
try:
|
| 73 |
+
data = load_data(tokenizer, model, device)
|
| 74 |
+
except Exception as e:
|
| 75 |
+
st.error(f"Error loading dataset: {e}")
|
| 76 |
+
return
|
| 77 |
|
| 78 |
# Input user query
|
| 79 |
user_query = st.text_input("Describe your project or learning goal:",
|
| 80 |
"I am working on a project to recommend music using pandas and numpy.")
|
| 81 |
if user_query:
|
| 82 |
+
with st.spinner("Processing your query..."):
|
| 83 |
+
query_embedding = generate_embedding(user_query, tokenizer, model, device)
|
| 84 |
+
|
| 85 |
+
# Compute similarity
|
| 86 |
+
try:
|
| 87 |
+
data['similarity'] = data['embedding'].apply(
|
| 88 |
+
lambda emb: cosine_similarity([query_embedding], [np.array(emb)])[0][0]
|
| 89 |
+
)
|
| 90 |
+
|
| 91 |
+
# Filter and sort recommendations
|
| 92 |
+
top_recommendations = (
|
| 93 |
+
data.sort_values(by='similarity', ascending=False)
|
| 94 |
+
.head(5)
|
| 95 |
+
)
|
| 96 |
+
|
| 97 |
+
# Display recommendations
|
| 98 |
+
st.subheader("Top Recommendations")
|
| 99 |
+
for idx, row in top_recommendations.iterrows():
|
| 100 |
+
st.markdown(f"### {row['repo']}")
|
| 101 |
+
st.write(f"**Path:** {row['path']}")
|
| 102 |
+
st.write(f"**Summary:** {row['summary']}")
|
| 103 |
+
st.write(f"**Similarity Score:** {row['similarity']:.2f}")
|
| 104 |
+
st.markdown(f"[Repository Link]({row['url']})")
|
| 105 |
+
|
| 106 |
+
# Fetch and display README
|
| 107 |
+
st.subheader("Repository README")
|
| 108 |
+
readme_content = fetch_readme(row['url'])
|
| 109 |
+
st.code(readme_content)
|
| 110 |
+
|
| 111 |
+
except Exception as e:
|
| 112 |
+
st.error(f"Error computing recommendations: {e}")
|
| 113 |
|
| 114 |
# Script entry point: run the Streamlit app's main flow when executed
# directly (not when imported as a module).
if __name__ == "__main__":
    main()
|