kinely committed on
Commit
ebf5ff6
·
verified ·
1 Parent(s): c503e77

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +55 -40
app.py CHANGED
@@ -3,22 +3,57 @@ from transformers import T5ForConditionalGeneration, T5Tokenizer
3
  from sentence_transformers import SentenceTransformer
4
  import faiss
5
  import torch
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
  # Load model and tokenizer
8
  model_name = "google/flan-t5-base"
9
  model = T5ForConditionalGeneration.from_pretrained(model_name)
10
  tokenizer = T5Tokenizer.from_pretrained(model_name)
11
 
12
- # Define your sentence transformer model for the RAG approach
13
- embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
14
-
15
- # Build FAISS index (assuming you have precomputed embeddings for your retrieval corpus)
16
- # embeddings = ... # Your precomputed embeddings go here
17
- # faiss_index = faiss.IndexFlatL2(embeddings.shape[1])
18
- # faiss_index.add(embeddings)
19
-
20
- # Define the Streamlit interface
21
- st.title("Humanized Text Generator")
22
 
23
  # Text input from the user
24
  user_input = st.text_area("Enter your query here", max_chars=2000)
@@ -28,14 +63,16 @@ if st.button("Generate Humanized Text"):
28
  if user_input:
29
  # Convert user input to embedding for retrieval
30
  query_embedding = embedder.encode([user_input], convert_to_tensor=True)
31
-
32
- # Retrieve the top k related documents from your FAISS index
33
- # _, top_k_indices = faiss_index.search(query_embedding.cpu().numpy(), k=5)
34
-
35
- # Dummy document context (replace this with actual retrieved docs)
36
- # context = retrieve_documents(top_k_indices)
37
- context = "Sample context related to the query." # For demonstration
38
-
 
 
39
  # Concatenate query and context
40
  input_text = f"{user_input} {context}"
41
 
@@ -50,25 +87,3 @@ if st.button("Generate Humanized Text"):
50
  st.write(generated_text)
51
  else:
52
  st.write("Please enter a query.")
53
-
54
- import faiss
55
- import numpy as np
56
-
57
- # Load your corpus embeddings
58
- # embeddings = np.load("embeddings.npy")
59
-
60
- # Initialize FAISS index and add the embeddings
61
- faiss_index = faiss.IndexFlatL2(embeddings.shape[1]) # Use L2 distance
62
- faiss_index.add(embeddings)
63
-
64
- # When you have a query, encode it and retrieve the top documents
65
- query_embedding = embedder.encode([user_input], convert_to_tensor=True)
66
- _, top_k_indices = faiss_index.search(query_embedding.cpu().numpy(), k=5)
67
-
68
- # Retrieve documents based on the top_k_indices
69
- def retrieve_documents(top_k_indices):
70
- # Here, you would map the indices to the actual documents in your corpus
71
- # This is just a placeholder
72
- documents = ["Doc 1", "Doc 2", "Doc 3", "Doc 4", "Doc 5"]
73
- return " ".join([documents[i] for i in top_k_indices[0]])
74
-
 
3
  from sentence_transformers import SentenceTransformer
4
  import faiss
5
  import torch
6
+ import numpy as np
7
+ import wikipediaapi
8
+
9
+ # Initialize Wikipedia API
10
+ wiki_wiki = wikipediaapi.Wikipedia('en')
11
+
12
+ # Function to fetch content from Wikipedia
13
+ def fetch_wikipedia_articles(titles):
14
+ corpus = []
15
+ for title in titles:
16
+ page = wiki_wiki.page(title)
17
+ if page.exists():
18
+ corpus.append(page.text)
19
+ else:
20
+ st.write(f"Page for '{title}' does not exist.")
21
+ return corpus
22
+
23
+ # Initialize SentenceTransformer for embeddings
24
+ embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
25
+
26
+ # List of Wikipedia articles to retrieve
27
+ titles = [
28
+ "Crypto",
29
+ "Finance",
30
+ "Technology",
31
+ "Healthcare",
32
+ "Education"
33
+ ]
34
+
35
+ # Fetch and create the corpus
36
+ st.write("Fetching Wikipedia articles...")
37
+ corpus = fetch_wikipedia_articles(titles)
38
+
39
+ # Generate embeddings for the corpus
40
+ st.write("Generating embeddings...")
41
+ embeddings = embedder.encode(corpus, convert_to_tensor=True)
42
+
43
+ # Convert embeddings to NumPy array
44
+ embeddings_np = embeddings.cpu().numpy()
45
+
46
+ # Initialize FAISS index and add embeddings
47
+ faiss_index = faiss.IndexFlatL2(embeddings_np.shape[1])
48
+ faiss_index.add(embeddings_np)
49
 
50
  # Load model and tokenizer
51
  model_name = "google/flan-t5-base"
52
  model = T5ForConditionalGeneration.from_pretrained(model_name)
53
  tokenizer = T5Tokenizer.from_pretrained(model_name)
54
 
55
+ # Streamlit interface
56
+ st.title("Humanized AI Text Generator")
 
 
 
 
 
 
 
 
57
 
58
  # Text input from the user
59
  user_input = st.text_area("Enter your query here", max_chars=2000)
 
63
  if user_input:
64
  # Convert user input to embedding for retrieval
65
  query_embedding = embedder.encode([user_input], convert_to_tensor=True)
66
+
67
+ # Retrieve top 5 related documents from FAISS index
68
+ _, top_k_indices = faiss_index.search(query_embedding.cpu().numpy(), k=5)
69
+
70
+ # Retrieve documents based on top_k_indices
71
+ def retrieve_documents(top_k_indices):
72
+ return " ".join([corpus[i] for i in top_k_indices[0]])
73
+
74
+ context = retrieve_documents(top_k_indices)
75
+
76
  # Concatenate query and context
77
  input_text = f"{user_input} {context}"
78
 
 
87
  st.write(generated_text)
88
  else:
89
  st.write("Please enter a query.")