T-K-O-H committed on
Commit d2db90e · 1 Parent(s): 2f7c4bc

huggingface issue 23

Files changed (1)
  1. app.py +230 -230
app.py CHANGED
@@ -693,168 +693,168 @@ class SentenceTransformerWrapper:
         """Synchronous embed function."""
         return self.embed_text(text)

-def evaluate_models(dataset):
-    """Evaluate embedding models using RAGAS metrics."""
-    try:
-        # Initialize models
-        openai_model = OpenAIEmbeddings(model="text-embedding-3-small")
-
-        base_mpnet = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
-        base_mpnet_wrapper = SentenceTransformerWrapper(base_mpnet)
-
-        fine_tuned_model = SentenceTransformer("Shipmaster1/finetuned_mpnet_matryoshka_mnr")
-        fine_tuned_wrapper = SentenceTransformerWrapper(fine_tuned_model)
-
-        # Initialize evaluation metrics
-        metrics = [
-            faithfulness, # How well answers align with context
-            answer_relevancy, # How relevant answers are to questions
-            context_recall, # How well context covers required information
-            context_precision # How focused and precise the context is
-        ]
-
-        # Create evaluation dataset with all required columns
-        eval_dataset = Dataset.from_dict({
-            "question": dataset["question"],
-            "answer": dataset["answer"],
-            "context": dataset["context"],
-            "retrieved_contexts": [[ctx] for ctx in dataset["context"]], # Each context in its own list
-            "reference": dataset["context"] # Using context as reference for recall calculation
-        })
-
-        # Evaluate each model and store results
-        results = {}
-
-        # OpenAI model evaluation
-        openai_eval = evaluate(
-            eval_dataset,
-            metrics=metrics,
-            embeddings=openai_model,
-            llm=ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
-        )
-        results["OpenAI"] = {
-            "faithfulness": float(openai_eval._repr_dict["faithfulness"]),
-            "answer_relevancy": float(openai_eval._repr_dict["answer_relevancy"]),
-            "context_recall": float(openai_eval._repr_dict["context_recall"]),
-            "context_precision": float(openai_eval._repr_dict["context_precision"])
-        }
-
-        # Base MPNet evaluation
-        base_mpnet_eval = evaluate(
-            eval_dataset,
-            metrics=metrics,
-            embeddings=base_mpnet_wrapper,
-            llm=ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
-        )
-        results["Base MPNet"] = {
-            "faithfulness": float(base_mpnet_eval._repr_dict["faithfulness"]),
-            "answer_relevancy": float(base_mpnet_eval._repr_dict["answer_relevancy"]),
-            "context_recall": float(base_mpnet_eval._repr_dict["context_recall"]),
-            "context_precision": float(base_mpnet_eval._repr_dict["context_precision"])
-        }
-
-        # Fine-tuned MPNet evaluation
-        fine_tuned_eval = evaluate(
-            eval_dataset,
-            metrics=metrics,
-            embeddings=fine_tuned_wrapper,
-            llm=ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
-        )
-        results["Fine-tuned MPNet"] = {
-            "faithfulness": float(fine_tuned_eval._repr_dict["faithfulness"]),
-            "answer_relevancy": float(fine_tuned_eval._repr_dict["answer_relevancy"]),
-            "context_recall": float(fine_tuned_eval._repr_dict["context_recall"]),
-            "context_precision": float(fine_tuned_eval._repr_dict["context_precision"])
-        }
-
-        return results
-
-    except Exception as e:
-        print(f"Error evaluating models: {str(e)}")
-        return {}
-
-def create_comparison_plot(results):
-    """Create a comparison plot of the evaluation metrics."""
-    # Define metrics we're using
-    metrics = [
-        'faithfulness',
-        'answer_relevancy',
-        'context_recall',
-        'context_precision'
-    ]
+# def evaluate_models(dataset):
+#     """Evaluate embedding models using RAGAS metrics."""
+#     try:
+#         # Initialize models
+#         openai_model = OpenAIEmbeddings(model="text-embedding-3-small")
+
+#         base_mpnet = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
+#         base_mpnet_wrapper = SentenceTransformerWrapper(base_mpnet)
+
+#         fine_tuned_model = SentenceTransformer("Shipmaster1/finetuned_mpnet_matryoshka_mnr")
+#         fine_tuned_wrapper = SentenceTransformerWrapper(fine_tuned_model)
+
+#         # Initialize evaluation metrics
+#         metrics = [
+#             faithfulness, # How well answers align with context
+#             answer_relevancy, # How relevant answers are to questions
+#             context_recall, # How well context covers required information
+#             context_precision # How focused and precise the context is
+#         ]
+
+#         # Create evaluation dataset with all required columns
+#         eval_dataset = Dataset.from_dict({
+#             "question": dataset["question"],
+#             "answer": dataset["answer"],
+#             "context": dataset["context"],
+#             "retrieved_contexts": [[ctx] for ctx in dataset["context"]], # Each context in its own list
+#             "reference": dataset["context"] # Using context as reference for recall calculation
+#         })
+
+#         # Evaluate each model and store results
+#         results = {}
+
+#         # OpenAI model evaluation
+#         openai_eval = evaluate(
+#             eval_dataset,
+#             metrics=metrics,
+#             embeddings=openai_model,
+#             llm=ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
+#         )
+#         results["OpenAI"] = {
+#             "faithfulness": float(openai_eval._repr_dict["faithfulness"]),
+#             "answer_relevancy": float(openai_eval._repr_dict["answer_relevancy"]),
+#             "context_recall": float(openai_eval._repr_dict["context_recall"]),
+#             "context_precision": float(openai_eval._repr_dict["context_precision"])
+#         }
+
+#         # Base MPNet evaluation
+#         base_mpnet_eval = evaluate(
+#             eval_dataset,
+#             metrics=metrics,
+#             embeddings=base_mpnet_wrapper,
+#             llm=ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
+#         )
+#         results["Base MPNet"] = {
+#             "faithfulness": float(base_mpnet_eval._repr_dict["faithfulness"]),
+#             "answer_relevancy": float(base_mpnet_eval._repr_dict["answer_relevancy"]),
+#             "context_recall": float(base_mpnet_eval._repr_dict["context_recall"]),
+#             "context_precision": float(base_mpnet_eval._repr_dict["context_precision"])
+#         }
+
+#         # Fine-tuned MPNet evaluation
+#         fine_tuned_eval = evaluate(
+#             eval_dataset,
+#             metrics=metrics,
+#             embeddings=fine_tuned_wrapper,
+#             llm=ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
+#         )
+#         results["Fine-tuned MPNet"] = {
+#             "faithfulness": float(fine_tuned_eval._repr_dict["faithfulness"]),
+#             "answer_relevancy": float(fine_tuned_eval._repr_dict["answer_relevancy"]),
+#             "context_recall": float(fine_tuned_eval._repr_dict["context_recall"]),
+#             "context_precision": float(fine_tuned_eval._repr_dict["context_precision"])
+#         }
+
+#         return results
+
+#     except Exception as e:
+#         print(f"Error evaluating models: {str(e)}")
+#         return {}
+
+# def create_comparison_plot(results):
+#     """Create a comparison plot of the evaluation metrics."""
+#     # Define metrics we're using
+#     metrics = [
+#         'faithfulness',
+#         'answer_relevancy',
+#         'context_recall',
+#         'context_precision'
+#     ]

-    # Extract scores for each model
-    models = list(results.keys())
-    model_scores = {
-        model: [results[model][metric] for metric in metrics]
-        for model in models
-    }
+#     # Extract scores for each model
+#     models = list(results.keys())
+#     model_scores = {
+#         model: [results[model][metric] for metric in metrics]
+#         for model in models
+#     }

-    fig = go.Figure()
+#     fig = go.Figure()

-    # Add traces for each model
-    colors = {
-        "OpenAI": 'rgb(55, 83, 109)',
-        "Base MPNet": 'rgb(26, 118, 255)',
-        "Fine-tuned MPNet": 'rgb(15, 196, 141)'
-    }
+#     # Add traces for each model
+#     colors = {
+#         "OpenAI": 'rgb(55, 83, 109)',
+#         "Base MPNet": 'rgb(26, 118, 255)',
+#         "Fine-tuned MPNet": 'rgb(15, 196, 141)'
+#     }

-    for model in models:
-        fig.add_trace(go.Bar(
-            name=model,
-            x=metrics,
-            y=model_scores[model],
-            marker_color=colors.get(model, 'rgb(128, 128, 128)')
-        ))
+#     for model in models:
+#         fig.add_trace(go.Bar(
+#             name=model,
+#             x=metrics,
+#             y=model_scores[model],
+#             marker_color=colors.get(model, 'rgb(128, 128, 128)')
+#         ))

-    # Update layout
-    fig.update_layout(
-        title='Model Comparison Metrics',
-        xaxis_title='Metrics',
-        yaxis_title='Score',
-        barmode='group',
-        yaxis=dict(range=[0, 1]),
-        showlegend=True
-    )
+#     # Update layout
+#     fig.update_layout(
+#         title='Model Comparison Metrics',
+#         xaxis_title='Metrics',
+#         yaxis_title='Score',
+#         barmode='group',
+#         yaxis=dict(range=[0, 1]),
+#         showlegend=True
+#     )

-    return fig
+#     return fig

-def run_ragas_evaluation():
-    """Run the complete RAGAS evaluation process."""
-    try:
-        # Generate synthetic dataset
-        dataset = create_synthetic_dataset()
+# def run_ragas_evaluation():
+#     """Run the complete RAGAS evaluation process."""
+#     try:
+#         # Generate synthetic dataset
+#         dataset = create_synthetic_dataset()

-        # Evaluate models
-        results = evaluate_models(dataset)
+#         # Evaluate models
+#         results = evaluate_models(dataset)

-        # Create comparison plot
-        plot = create_comparison_plot(results)
+#         # Create comparison plot
+#         plot = create_comparison_plot(results)

-        # Format results as markdown
-        results_md = """## Model Evaluation Results
+#         # Format results as markdown
+#         results_md = """## Model Evaluation Results

-### Models Being Compared
-- **OpenAI Model**: text-embedding-3-small
-- **Base MPNet**: sentence-transformers/all-mpnet-base-v2
-- **Fine-tuned Model**: Shipmaster1/finetuned_mpnet_matryoshka_mnr
+# ### Models Being Compared
+# - **OpenAI Model**: text-embedding-3-small
+# - **Base MPNet**: sentence-transformers/all-mpnet-base-v2
+# - **Fine-tuned Model**: Shipmaster1/finetuned_mpnet_matryoshka_mnr

-### OpenAI Model (text-embedding-3-small)
-"""
-        for metric in results["OpenAI"]:
-            results_md += "- {}: {:.3f}\n".format(metric, results["OpenAI"][metric])
+# ### OpenAI Model (text-embedding-3-small)
+# """
+#         for metric in results["OpenAI"]:
+#             results_md += "- {}: {:.3f}\n".format(metric, results["OpenAI"][metric])

-        results_md += "\n### Base MPNet Model (all-mpnet-base-v2)\n"
-        for metric in results["Base MPNet"]:
-            results_md += "- {}: {:.3f}\n".format(metric, results["Base MPNet"][metric])
+#         results_md += "\n### Base MPNet Model (all-mpnet-base-v2)\n"
+#         for metric in results["Base MPNet"]:
+#             results_md += "- {}: {:.3f}\n".format(metric, results["Base MPNet"][metric])

-        results_md += "\n### Fine-tuned Model (finetuned_mpnet_matryoshka_mnr)\n"
-        for metric in results["Fine-tuned MPNet"]:
-            results_md += "- {}: {:.3f}\n".format(metric, results["Fine-tuned MPNet"][metric])
+#         results_md += "\n### Fine-tuned Model (finetuned_mpnet_matryoshka_mnr)\n"
+#         for metric in results["Fine-tuned MPNet"]:
+#             results_md += "- {}: {:.3f}\n".format(metric, results["Fine-tuned MPNet"][metric])

-        return results_md, plot
-    except Exception as e:
-        return f"Error during evaluation: {str(e)}", None
+#         return results_md, plot
+#     except Exception as e:
+#         return f"Error during evaluation: {str(e)}", None

 def create_ui():
     with gr.Blocks(theme='JohnSmith9982/small_and_pretty') as demo:
@@ -1016,39 +1016,39 @@ def create_ui():
                 """
             )

-        with gr.TabItem("RAGAS Evaluation"):
-            gr.Markdown(
-                """
-                # RAGAS Model Evaluation
-                Compare the performance of three embedding models using synthetic data.
+        # with gr.TabItem("RAGAS Evaluation"):
+        #     gr.Markdown(
+        #         """
+        #         # RAGAS Model Evaluation
+        #         Compare the performance of three embedding models using synthetic data.

-                ### Models Being Evaluated
-                - **OpenAI Model**: text-embedding-3-small (Not Free)
-                - **Base MPNet**: sentence-transformers/all-mpnet-base-v2 (Open Source)
-                - **Fine-tuned Model**: Shipmaster1/finetuned_mpnet_matryoshka_mnr (Free Custom, trained on YouTube transcript handling)
+        #         ### Models Being Evaluated
+        #         - **OpenAI Model**: text-embedding-3-small (Not Free)
+        #         - **Base MPNet**: sentence-transformers/all-mpnet-base-v2 (Open Source)
+        #         - **Fine-tuned Model**: Shipmaster1/finetuned_mpnet_matryoshka_mnr (Free Custom, trained on YouTube transcript handling)

-                The evaluation uses GPT-3.5 Turbo to assess the quality of the embeddings through various metrics:
-                - Faithfulness: How well the answers align with the provided context
-                - Answer Relevancy: How relevant the answers are to the questions
-                - Context Recall: How well the model retrieves relevant context
-                - Context Precision: How precise the retrieved context is
+        #         The evaluation uses GPT-3.5 Turbo to assess the quality of the embeddings through various metrics:
+        #         - Faithfulness: How well the answers align with the provided context
+        #         - Answer Relevancy: How relevant the answers are to the questions
+        #         - Context Recall: How well the model retrieves relevant context
+        #         - Context Precision: How precise the retrieved context is


-                Click the run button to find out how well the models perform on the synthetic data.
-                """
-            )
+        #         Click the run button to find out how well the models perform on the synthetic data.
+        #         """
+        #     )

-            with gr.Row():
-                evaluate_btn = gr.Button("Run Evaluation", variant="primary", size="lg")
+        #     with gr.Row():
+        #         evaluate_btn = gr.Button("Run Evaluation", variant="primary", size="lg")

-            with gr.Row():
-                results_md = gr.Markdown(label="Evaluation Results")
-                plot_output = gr.Plot(label="Comparison Plot")
+        #     with gr.Row():
+        #         results_md = gr.Markdown(label="Evaluation Results")
+        #         plot_output = gr.Plot(label="Comparison Plot")

-            evaluate_btn.click(
-                fn=run_ragas_evaluation,
-                outputs=[results_md, plot_output]
-            )
+        #     evaluate_btn.click(
+        #         fn=run_ragas_evaluation,
+        #         outputs=[results_md, plot_output]
+        #     )

         def update_loading_state(stage: str):
             """Update loading indicators based on current stage."""
@@ -1367,32 +1367,32 @@ def print_graph():
     """)
    print("-----------------------------\n")

-def extract_text_from_webpage(url: str) -> str:
-    """Extract main content text from a webpage."""
-    try:
-        # Use trafilatura for better content extraction
-        downloaded = trafilatura.fetch_url(url)
-        if downloaded:
-            text = trafilatura.extract(downloaded, include_links=False, include_images=False)
-            if text:
-                return text.strip()
+# def extract_text_from_webpage(url: str) -> str:
+#     """Extract main content text from a webpage."""
+#     try:
+#         # Use trafilatura for better content extraction
+#         downloaded = trafilatura.fetch_url(url)
+#         if downloaded:
+#             text = trafilatura.extract(downloaded, include_links=False, include_images=False)
+#             if text:
+#                 return text.strip()

-        # Fallback to basic BeautifulSoup extraction
-        response = requests.get(url)
-        response.raise_for_status()
-        soup = BeautifulSoup(response.text, 'html.parser')
-
-        # Remove script and style elements
-        for script in soup(["script", "style"]):
-            script.decompose()
+#         # Fallback to basic BeautifulSoup extraction
+#         response = requests.get(url)
+#         response.raise_for_status()
+#         soup = BeautifulSoup(response.text, 'html.parser')
+
+#         # Remove script and style elements
+#         for script in soup(["script", "style"]):
+#             script.decompose()

-        text = soup.get_text()
-        lines = (line.strip() for line in text.splitlines())
-        chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
-        text = ' '.join(chunk for chunk in chunks if chunk)
-        return text.strip()
-    except Exception as e:
-        raise Exception(f"Error extracting webpage content: {str(e)}")
+#         text = soup.get_text()
+#         lines = (line.strip() for line in text.splitlines())
+#         chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
+#         text = ' '.join(chunk for chunk in chunks if chunk)
+#         return text.strip()
+#     except Exception as e:
+#         raise Exception(f"Error extracting webpage content: {str(e)}")

 def process_youtube_video(video_url: str, model_name: str = "Shipmaster1/finetuned_mpnet_matryoshka_mnr"):
     """Process a YouTube video and store its content in the vector store using LangChain."""
@@ -1434,40 +1434,40 @@ def process_youtube_video(video_url: str, model_name: str = "Shipmaster1/finetun
     except Exception as e:
         return None, f"Error processing video: {str(e)}"

-def process_webpage(url: str, model_name: str = "Shipmaster1/finetuned_mpnet_matryoshka_mnr"):
-    """Process a webpage and store its content in the vector store using LangChain."""
-    try:
-        # Get webpage content
-        content = extract_text_from_webpage(url)
-        if not content:
-            return None, "Failed to extract webpage content"
+# def process_webpage(url: str, model_name: str = "Shipmaster1/finetuned_mpnet_matryoshka_mnr"):
+#     """Process a webpage and store its content in the vector store using LangChain."""
+#     try:
+#         # Get webpage content
+#         content = extract_text_from_webpage(url)
+#         if not content:
+#             return None, "Failed to extract webpage content"

-        # Create document with metadata
-        doc = Document(
-            page_content=content,
-            metadata={
-                "url": url,
-                "source": "webpage",
-                "timestamp": datetime.now().isoformat()
-            }
-        )
-
-        # Split text into chunks
-        text_splitter = RecursiveCharacterTextSplitter(
-            chunk_size=1000,
-            chunk_overlap=200,
-            length_function=len,
-        )
-        chunks = text_splitter.split_documents([doc])
-
-        # Store in Chroma using LangChain's abstraction
-        collection = get_chroma_collection(model_name)
-        collection.add_documents(chunks)
-
-        return doc, "Successfully processed webpage"
-
-    except Exception as e:
-        return None, f"Error processing webpage: {str(e)}"
+#         # Create document with metadata
+#         doc = Document(
+#             page_content=content,
+#             metadata={
+#                 "url": url,
+#                 "source": "webpage",
+#                 "timestamp": datetime.now().isoformat()
+#             }
+#         )
+
+#         # Split text into chunks
+#         text_splitter = RecursiveCharacterTextSplitter(
+#             chunk_size=1000,
+#             chunk_overlap=200,
+#             length_function=len,
+#         )
+#         chunks = text_splitter.split_documents([doc])
+
+#         # Store in Chroma using LangChain's abstraction
+#         collection = get_chroma_collection(model_name)
+#         collection.add_documents(chunks)
+
+#         return doc, "Successfully processed webpage"
+
+#     except Exception as e:
+#         return None, f"Error processing webpage: {str(e)}"

 def agent_decide(state: ProcessState, progress=gr.Progress()) -> ProcessState:
     """Agent decides whether to enhance content further based on verification score and creates an improvement plan."""