diff --git a/.gitattributes b/.gitattributes index 02167d30a564f5bbe33c525ec172f4eef0ec11d8..4a04528946ce8d9d3019245506ac229e3cf235fd 100644 --- a/.gitattributes +++ b/.gitattributes @@ -37,3 +37,50 @@ bitcoinforum/2_train_set_creation/inputs.csv filter=lfs diff=lfs merge=lfs -text bitcoinforum/3_training/old/bert/validation_samples.html filter=lfs diff=lfs merge=lfs -text bitcoinforum/5_processing_extracted_data/hardware_instances_inc_threads.csv filter=lfs diff=lfs merge=lfs -text bitcoinforum/5_processing_extracted_data/not_usable_threads.txt filter=lfs diff=lfs merge=lfs -text +Raw[[:space:]]Data/MINING_sorted-preprocessed-data/groupbuys/BitcoinForum_groupbuys_021113_to_261213.csv filter=lfs diff=lfs merge=lfs -text +Raw[[:space:]]Data/MINING_sorted-preprocessed-data/groupbuys/BitcoinForum_groupbuys_090214_to_180414.csv filter=lfs diff=lfs merge=lfs -text +Raw[[:space:]]Data/MINING_sorted-preprocessed-data/groupbuys/BitcoinForum_groupbuys_180414_to_251214.csv filter=lfs diff=lfs merge=lfs -text +Raw[[:space:]]Data/MINING_sorted-preprocessed-data/groupbuys/BitcoinForum_groupbuys_250313_to_280613.csv filter=lfs diff=lfs merge=lfs -text +Raw[[:space:]]Data/MINING_sorted-preprocessed-data/groupbuys/BitcoinForum_groupbuys_251214_to_110123.csv filter=lfs diff=lfs merge=lfs -text +Raw[[:space:]]Data/MINING_sorted-preprocessed-data/groupbuys/BitcoinForum_groupbuys_261213_to_090214.csv filter=lfs diff=lfs merge=lfs -text +Raw[[:space:]]Data/MINING_sorted-preprocessed-data/groupbuys/BitcoinForum_groupbuys_280613_to_290813.csv filter=lfs diff=lfs merge=lfs -text +Raw[[:space:]]Data/MINING_sorted-preprocessed-data/groupbuys/BitcoinForum_groupbuys_290813_to_021113.csv filter=lfs diff=lfs merge=lfs -text +Raw[[:space:]]Data/MINING_sorted-preprocessed-data/hardware/BitcoinForum_hardware_020611_to_280313.csv filter=lfs diff=lfs merge=lfs -text +Raw[[:space:]]Data/MINING_sorted-preprocessed-data/hardware/BitcoinForum_hardware_080818_to_171221.csv filter=lfs diff=lfs merge=lfs 
-text +Raw[[:space:]]Data/MINING_sorted-preprocessed-data/hardware/BitcoinForum_hardware_120215_to_300915.csv filter=lfs diff=lfs merge=lfs -text +Raw[[:space:]]Data/MINING_sorted-preprocessed-data/hardware/BitcoinForum_hardware_160914_to_120215.csv filter=lfs diff=lfs merge=lfs -text +Raw[[:space:]]Data/MINING_sorted-preprocessed-data/hardware/BitcoinForum_hardware_171221_to_200923.csv filter=lfs diff=lfs merge=lfs -text +Raw[[:space:]]Data/MINING_sorted-preprocessed-data/hardware/BitcoinForum_hardware_230416_to_280417.csv filter=lfs diff=lfs merge=lfs -text +Raw[[:space:]]Data/MINING_sorted-preprocessed-data/hardware/BitcoinForum_hardware_240613_to_241013.csv filter=lfs diff=lfs merge=lfs -text +Raw[[:space:]]Data/MINING_sorted-preprocessed-data/hardware/BitcoinForum_hardware_241013_to_270114.csv filter=lfs diff=lfs merge=lfs -text +Raw[[:space:]]Data/MINING_sorted-preprocessed-data/hardware/BitcoinForum_hardware_250414_to_160914.csv filter=lfs diff=lfs merge=lfs -text +Raw[[:space:]]Data/MINING_sorted-preprocessed-data/hardware/BitcoinForum_hardware_270114_to_250414.csv filter=lfs diff=lfs merge=lfs -text +Raw[[:space:]]Data/MINING_sorted-preprocessed-data/hardware/BitcoinForum_hardware_280313_to_240613.csv filter=lfs diff=lfs merge=lfs -text +Raw[[:space:]]Data/MINING_sorted-preprocessed-data/hardware/BitcoinForum_hardware_280417_to_080818.csv filter=lfs diff=lfs merge=lfs -text +Raw[[:space:]]Data/MINING_sorted-preprocessed-data/hardware/BitcoinForum_hardware_300915_to_230416.csv filter=lfs diff=lfs merge=lfs -text +Raw[[:space:]]Data/MINING_sorted-preprocessed-data/miners/BitcoinForum_miners_011010_to_060911.csv filter=lfs diff=lfs merge=lfs -text +Raw[[:space:]]Data/MINING_sorted-preprocessed-data/miners/BitcoinForum_miners_060911_to_220613.csv filter=lfs diff=lfs merge=lfs -text +Raw[[:space:]]Data/MINING_sorted-preprocessed-data/miners/BitcoinForum_miners_220613_to_270215.csv filter=lfs diff=lfs merge=lfs -text 
+Raw[[:space:]]Data/MINING_sorted-preprocessed-data/miners/BitcoinForum_miners_270215_to_170923.csv filter=lfs diff=lfs merge=lfs -text +Raw[[:space:]]Data/MINING_sorted-preprocessed-data/mining_speculation/BitcoinForum_mining_speculation_050116_to_080117.csv filter=lfs diff=lfs merge=lfs -text +Raw[[:space:]]Data/MINING_sorted-preprocessed-data/mining_speculation/BitcoinForum_mining_speculation_050514_to_141114.csv filter=lfs diff=lfs merge=lfs -text +Raw[[:space:]]Data/MINING_sorted-preprocessed-data/mining_speculation/BitcoinForum_mining_speculation_050615_to_050116.csv filter=lfs diff=lfs merge=lfs -text +Raw[[:space:]]Data/MINING_sorted-preprocessed-data/mining_speculation/BitcoinForum_mining_speculation_080117_to_220120.csv filter=lfs diff=lfs merge=lfs -text +Raw[[:space:]]Data/MINING_sorted-preprocessed-data/mining_speculation/BitcoinForum_mining_speculation_141114_to_050615.csv filter=lfs diff=lfs merge=lfs -text +Raw[[:space:]]Data/MINING_sorted-preprocessed-data/mining_speculation/BitcoinForum_mining_speculation_220120_to_250923.csv filter=lfs diff=lfs merge=lfs -text +Raw[[:space:]]Data/MINING_sorted-preprocessed-data/mining_speculation/BitcoinForum_mining_speculation_240211_to_050514.csv filter=lfs diff=lfs merge=lfs -text +Raw[[:space:]]Data/MINING_sorted-preprocessed-data/mining_support/BitcoinForum_mining_support_100915_to_110420.csv filter=lfs diff=lfs merge=lfs -text +Raw[[:space:]]Data/MINING_sorted-preprocessed-data/mining_support/BitcoinForum_mining_support_110420_to_200923.csv filter=lfs diff=lfs merge=lfs -text +Raw[[:space:]]Data/MINING_sorted-preprocessed-data/mining_support/BitcoinForum_mining_support_130511_to_100915.csv filter=lfs diff=lfs merge=lfs -text +Raw[[:space:]]Data/MINING_sorted-preprocessed-data/mining/BitcoinForum_mining_060312_to_090714.csv filter=lfs diff=lfs merge=lfs -text +Raw[[:space:]]Data/MINING_sorted-preprocessed-data/mining/BitcoinForum_mining_090714_to_191216.csv filter=lfs diff=lfs merge=lfs -text 
+Raw[[:space:]]Data/MINING_sorted-preprocessed-data/mining/BitcoinForum_mining_090910_to_060312.csv filter=lfs diff=lfs merge=lfs -text +Raw[[:space:]]Data/MINING_sorted-preprocessed-data/mining/BitcoinForum_mining_191216_to_250923.csv filter=lfs diff=lfs merge=lfs -text +Raw[[:space:]]Data/MINING_sorted-preprocessed-data/pools/BitcoinForum_pools_020715_to_291117.csv filter=lfs diff=lfs merge=lfs -text +Raw[[:space:]]Data/MINING_sorted-preprocessed-data/pools/BitcoinForum_pools_110812_to_130713.csv filter=lfs diff=lfs merge=lfs -text +Raw[[:space:]]Data/MINING_sorted-preprocessed-data/pools/BitcoinForum_pools_130713_to_140814.csv filter=lfs diff=lfs merge=lfs -text +Raw[[:space:]]Data/MINING_sorted-preprocessed-data/pools/BitcoinForum_pools_140711_to_261211.csv filter=lfs diff=lfs merge=lfs -text +Raw[[:space:]]Data/MINING_sorted-preprocessed-data/pools/BitcoinForum_pools_140814_to_020715.csv filter=lfs diff=lfs merge=lfs -text +Raw[[:space:]]Data/MINING_sorted-preprocessed-data/pools/BitcoinForum_pools_261211_to_110812.csv filter=lfs diff=lfs merge=lfs -text +Raw[[:space:]]Data/MINING_sorted-preprocessed-data/pools/BitcoinForum_pools_271110_to_140711.csv filter=lfs diff=lfs merge=lfs -text +Raw[[:space:]]Data/MINING_sorted-preprocessed-data/pools/BitcoinForum_pools_291117_to_250923.csv filter=lfs diff=lfs merge=lfs -text diff --git a/Raw Data/.DS_Store b/Raw Data/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..0fabda6a97cf307f90ff809d756ff211976bba47 Binary files /dev/null and b/Raw Data/.DS_Store differ diff --git a/Raw Data/MINING_sorted-preprocessed-data/.DS_Store b/Raw Data/MINING_sorted-preprocessed-data/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..8e8d594e9a981a94c8b1aa287a871dc4f4e92d8c Binary files /dev/null and b/Raw Data/MINING_sorted-preprocessed-data/.DS_Store differ diff --git a/Raw Data/MINING_sorted-preprocessed-data/groupbuys/BitcoinForum_groupbuys_021113_to_261213.csv b/Raw 
Data/MINING_sorted-preprocessed-data/groupbuys/BitcoinForum_groupbuys_021113_to_261213.csv new file mode 100644 index 0000000000000000000000000000000000000000..685d2662365bc6151575a406c23227370d54b16c --- /dev/null +++ b/Raw Data/MINING_sorted-preprocessed-data/groupbuys/BitcoinForum_groupbuys_021113_to_261213.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aeba523b5f20302b3a7139d87105dd3cb2f8dc0bc02f4d0fb561eb9566e2ce81 +size 25703434 diff --git a/Raw Data/MINING_sorted-preprocessed-data/groupbuys/BitcoinForum_groupbuys_090214_to_180414.csv b/Raw Data/MINING_sorted-preprocessed-data/groupbuys/BitcoinForum_groupbuys_090214_to_180414.csv new file mode 100644 index 0000000000000000000000000000000000000000..7aba82f6da2e8a0a03503564b9aa92c54725bd57 --- /dev/null +++ b/Raw Data/MINING_sorted-preprocessed-data/groupbuys/BitcoinForum_groupbuys_090214_to_180414.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fa4e968e5636a8ba92cd06697992ba898a3e8b772c8ad4d2e0c6e334e7ad549 +size 26736726 diff --git a/Raw Data/MINING_sorted-preprocessed-data/groupbuys/BitcoinForum_groupbuys_180414_to_251214.csv b/Raw Data/MINING_sorted-preprocessed-data/groupbuys/BitcoinForum_groupbuys_180414_to_251214.csv new file mode 100644 index 0000000000000000000000000000000000000000..1110657acd9196e18d556f4b647e46b235a588bb --- /dev/null +++ b/Raw Data/MINING_sorted-preprocessed-data/groupbuys/BitcoinForum_groupbuys_180414_to_251214.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23208e9b4764bc6e324dbe852719c9cd4b17d3cf6b24b66d0b857ea730bae4ad +size 27390026 diff --git a/Raw Data/MINING_sorted-preprocessed-data/groupbuys/BitcoinForum_groupbuys_250313_to_280613.csv b/Raw Data/MINING_sorted-preprocessed-data/groupbuys/BitcoinForum_groupbuys_250313_to_280613.csv new file mode 100644 index 0000000000000000000000000000000000000000..f3fbb7fd1bfd1bc50889cfddb0f8541dcf2e385c --- /dev/null +++ b/Raw 
Data/MINING_sorted-preprocessed-data/groupbuys/BitcoinForum_groupbuys_250313_to_280613.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d0c04dba9735e524ceba3fd73aff6d94e3b54a6308b97200f57ad2f1f856ea4 +size 25663867 diff --git a/Raw Data/MINING_sorted-preprocessed-data/groupbuys/BitcoinForum_groupbuys_251214_to_110123.csv b/Raw Data/MINING_sorted-preprocessed-data/groupbuys/BitcoinForum_groupbuys_251214_to_110123.csv new file mode 100644 index 0000000000000000000000000000000000000000..d59efd4464c00432c72e523dd2a46f7e546d9062 --- /dev/null +++ b/Raw Data/MINING_sorted-preprocessed-data/groupbuys/BitcoinForum_groupbuys_251214_to_110123.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfb1da7ac51306e98b8f9c5bfc9dd63856005c829f6103eb1c9f4032ba6461fd +size 16019358 diff --git a/Raw Data/MINING_sorted-preprocessed-data/groupbuys/BitcoinForum_groupbuys_261213_to_090214.csv b/Raw Data/MINING_sorted-preprocessed-data/groupbuys/BitcoinForum_groupbuys_261213_to_090214.csv new file mode 100644 index 0000000000000000000000000000000000000000..c4fb253e2a0b6ddfddb647c5e6327b08cc5556ff --- /dev/null +++ b/Raw Data/MINING_sorted-preprocessed-data/groupbuys/BitcoinForum_groupbuys_261213_to_090214.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:288f2e937b34e1db3a5c6fb9b7c99a3ab73272072288c572c070893c6506eab7 +size 25982001 diff --git a/Raw Data/MINING_sorted-preprocessed-data/groupbuys/BitcoinForum_groupbuys_280613_to_290813.csv b/Raw Data/MINING_sorted-preprocessed-data/groupbuys/BitcoinForum_groupbuys_280613_to_290813.csv new file mode 100644 index 0000000000000000000000000000000000000000..25c7a1117e231aa94c236187bfbe618394e7fe3f --- /dev/null +++ b/Raw Data/MINING_sorted-preprocessed-data/groupbuys/BitcoinForum_groupbuys_280613_to_290813.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:407dc2fa87b7b51455956bf0a3359d6ecc29147110de3ef6a90d4277f32736bc +size 25590230 diff --git 
a/Raw Data/MINING_sorted-preprocessed-data/groupbuys/BitcoinForum_groupbuys_290813_to_021113.csv b/Raw Data/MINING_sorted-preprocessed-data/groupbuys/BitcoinForum_groupbuys_290813_to_021113.csv new file mode 100644 index 0000000000000000000000000000000000000000..0e7658ba66536187dee87175361ca8f923c91734 --- /dev/null +++ b/Raw Data/MINING_sorted-preprocessed-data/groupbuys/BitcoinForum_groupbuys_290813_to_021113.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c1fbcc380a5ff80360f1bdf0e68b2b733e94b77da4df3c2e1f40839eb0acb69 +size 27560395 diff --git a/Raw Data/MINING_sorted-preprocessed-data/hardware/BitcoinForum_hardware_020611_to_280313.csv b/Raw Data/MINING_sorted-preprocessed-data/hardware/BitcoinForum_hardware_020611_to_280313.csv new file mode 100644 index 0000000000000000000000000000000000000000..f645bcfd955976c03fa8777afa394818be3c5e98 --- /dev/null +++ b/Raw Data/MINING_sorted-preprocessed-data/hardware/BitcoinForum_hardware_020611_to_280313.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:880bbd0bfa36d18ffe365c7c702fadf0ded68b839da4ae760218ed5b94585440 +size 28913387 diff --git a/Raw Data/MINING_sorted-preprocessed-data/hardware/BitcoinForum_hardware_080818_to_171221.csv b/Raw Data/MINING_sorted-preprocessed-data/hardware/BitcoinForum_hardware_080818_to_171221.csv new file mode 100644 index 0000000000000000000000000000000000000000..2ab4478b79eaeb393d4e828ddba2b064b7ea9322 --- /dev/null +++ b/Raw Data/MINING_sorted-preprocessed-data/hardware/BitcoinForum_hardware_080818_to_171221.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6af508ed543c293e1400ed4ae404040d16143b713daeae65489e4e9fd1fa101e +size 30268350 diff --git a/Raw Data/MINING_sorted-preprocessed-data/hardware/BitcoinForum_hardware_120215_to_300915.csv b/Raw Data/MINING_sorted-preprocessed-data/hardware/BitcoinForum_hardware_120215_to_300915.csv new file mode 100644 index 
0000000000000000000000000000000000000000..62b74eef1a79819487b042683c4004edd065b9cf --- /dev/null +++ b/Raw Data/MINING_sorted-preprocessed-data/hardware/BitcoinForum_hardware_120215_to_300915.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:487f842a143bb370c0ba0d374e0f02cd7fcfc402e01547a43699845aca1f7774 +size 33613759 diff --git a/Raw Data/MINING_sorted-preprocessed-data/hardware/BitcoinForum_hardware_160914_to_120215.csv b/Raw Data/MINING_sorted-preprocessed-data/hardware/BitcoinForum_hardware_160914_to_120215.csv new file mode 100644 index 0000000000000000000000000000000000000000..563b4d9f592be9c565ac9ff065a179e5e57cc9ad --- /dev/null +++ b/Raw Data/MINING_sorted-preprocessed-data/hardware/BitcoinForum_hardware_160914_to_120215.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd57826f94c82fcc324db3fb5db4eec2eef7f4ed05c608642e9dc83a35891ad0 +size 33296450 diff --git a/Raw Data/MINING_sorted-preprocessed-data/hardware/BitcoinForum_hardware_171221_to_200923.csv b/Raw Data/MINING_sorted-preprocessed-data/hardware/BitcoinForum_hardware_171221_to_200923.csv new file mode 100644 index 0000000000000000000000000000000000000000..5161cdbca102211e4dcee2c18633d6d9155b3e16 --- /dev/null +++ b/Raw Data/MINING_sorted-preprocessed-data/hardware/BitcoinForum_hardware_171221_to_200923.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00c82a144152589b81d256f5bb8524d6204ca1743536981e303d33a2d520bfde +size 14405234 diff --git a/Raw Data/MINING_sorted-preprocessed-data/hardware/BitcoinForum_hardware_230416_to_280417.csv b/Raw Data/MINING_sorted-preprocessed-data/hardware/BitcoinForum_hardware_230416_to_280417.csv new file mode 100644 index 0000000000000000000000000000000000000000..1a0ddea87e04c4c8be5fda93487be03af7e5180c --- /dev/null +++ b/Raw Data/MINING_sorted-preprocessed-data/hardware/BitcoinForum_hardware_230416_to_280417.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:bad5de4ea9da90ce98121ed8f6fccdf26889cb5907365983c11a9beb3ddeec1c +size 31339798 diff --git a/Raw Data/MINING_sorted-preprocessed-data/hardware/BitcoinForum_hardware_240613_to_241013.csv b/Raw Data/MINING_sorted-preprocessed-data/hardware/BitcoinForum_hardware_240613_to_241013.csv new file mode 100644 index 0000000000000000000000000000000000000000..d353037c2771276ba92fe4cc29bf775d9adc77da --- /dev/null +++ b/Raw Data/MINING_sorted-preprocessed-data/hardware/BitcoinForum_hardware_240613_to_241013.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1dfba7d5311dc600cd13a29e89bb1142fc7eb201b851cd03644980f37c41e94c +size 29545674 diff --git a/Raw Data/MINING_sorted-preprocessed-data/hardware/BitcoinForum_hardware_241013_to_270114.csv b/Raw Data/MINING_sorted-preprocessed-data/hardware/BitcoinForum_hardware_241013_to_270114.csv new file mode 100644 index 0000000000000000000000000000000000000000..41d1b5dbf767a0efc6557211d7376c2293e856de --- /dev/null +++ b/Raw Data/MINING_sorted-preprocessed-data/hardware/BitcoinForum_hardware_241013_to_270114.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53bbb0e605c28b18264648a27aa81d223f409782d70a12b7ed5cd74fe566ce6b +size 30814293 diff --git a/Raw Data/MINING_sorted-preprocessed-data/hardware/BitcoinForum_hardware_250414_to_160914.csv b/Raw Data/MINING_sorted-preprocessed-data/hardware/BitcoinForum_hardware_250414_to_160914.csv new file mode 100644 index 0000000000000000000000000000000000000000..e2e2861526d00eb6ddcefeef01fd8361724ecc5c --- /dev/null +++ b/Raw Data/MINING_sorted-preprocessed-data/hardware/BitcoinForum_hardware_250414_to_160914.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35fab6c162eb5fdeade21ea15076dfa4bce2f269af47d44818e5ae43b6c6bdc5 +size 33425670 diff --git a/Raw Data/MINING_sorted-preprocessed-data/hardware/BitcoinForum_hardware_270114_to_250414.csv b/Raw 
Data/MINING_sorted-preprocessed-data/hardware/BitcoinForum_hardware_270114_to_250414.csv new file mode 100644 index 0000000000000000000000000000000000000000..056713eb45b6a191510a147b31587211d762a852 --- /dev/null +++ b/Raw Data/MINING_sorted-preprocessed-data/hardware/BitcoinForum_hardware_270114_to_250414.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fced8a3adf4acfbf0e24425b953a51b348382fc320850dc05eb9d6bf0df1b30 +size 30779603 diff --git a/Raw Data/MINING_sorted-preprocessed-data/hardware/BitcoinForum_hardware_280313_to_240613.csv b/Raw Data/MINING_sorted-preprocessed-data/hardware/BitcoinForum_hardware_280313_to_240613.csv new file mode 100644 index 0000000000000000000000000000000000000000..58a45b9ca46045676fae0bd4c1a9b2681628e18a --- /dev/null +++ b/Raw Data/MINING_sorted-preprocessed-data/hardware/BitcoinForum_hardware_280313_to_240613.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7ac3cd8decbe608412579e9c6ba6490d45f70c825c9e0c0dc4dca3e05bbb025 +size 29649291 diff --git a/Raw Data/MINING_sorted-preprocessed-data/hardware/BitcoinForum_hardware_280417_to_080818.csv b/Raw Data/MINING_sorted-preprocessed-data/hardware/BitcoinForum_hardware_280417_to_080818.csv new file mode 100644 index 0000000000000000000000000000000000000000..2180e08f7cd31dbf44294249e6bf70d18597c479 --- /dev/null +++ b/Raw Data/MINING_sorted-preprocessed-data/hardware/BitcoinForum_hardware_280417_to_080818.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56ab66ffef878b93b36c1acafd2686e44aa55f33dcd5915efee742e206e507c0 +size 29749806 diff --git a/Raw Data/MINING_sorted-preprocessed-data/hardware/BitcoinForum_hardware_300915_to_230416.csv b/Raw Data/MINING_sorted-preprocessed-data/hardware/BitcoinForum_hardware_300915_to_230416.csv new file mode 100644 index 0000000000000000000000000000000000000000..98d96fe707cbdca5400c4ea9f9194059e3c29e50 --- /dev/null +++ b/Raw 
Data/MINING_sorted-preprocessed-data/hardware/BitcoinForum_hardware_300915_to_230416.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6bc808bba923b473680f1da3ebc74a1b625285f4bd8009c59033e146ad98e87 +size 35239136 diff --git a/Raw Data/MINING_sorted-preprocessed-data/miners/BitcoinForum_miners_011010_to_060911.csv b/Raw Data/MINING_sorted-preprocessed-data/miners/BitcoinForum_miners_011010_to_060911.csv new file mode 100644 index 0000000000000000000000000000000000000000..4ef45e6232dc91dc9007db0254cdaf3364f278d4 --- /dev/null +++ b/Raw Data/MINING_sorted-preprocessed-data/miners/BitcoinForum_miners_011010_to_060911.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7ba6b70970cf377fdcb8a5439e3a1ece4d7dff3f6aad53145e33a7aad59a301 +size 30548098 diff --git a/Raw Data/MINING_sorted-preprocessed-data/miners/BitcoinForum_miners_060911_to_220613.csv b/Raw Data/MINING_sorted-preprocessed-data/miners/BitcoinForum_miners_060911_to_220613.csv new file mode 100644 index 0000000000000000000000000000000000000000..f49b2b2ccb7ea7ea958b44d9cf1ebee186d4eb28 --- /dev/null +++ b/Raw Data/MINING_sorted-preprocessed-data/miners/BitcoinForum_miners_060911_to_220613.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60c3bae4272963bebb2114a61133434e2082618d1bac624e514b8b3ba813c34e +size 32227125 diff --git a/Raw Data/MINING_sorted-preprocessed-data/miners/BitcoinForum_miners_220613_to_270215.csv b/Raw Data/MINING_sorted-preprocessed-data/miners/BitcoinForum_miners_220613_to_270215.csv new file mode 100644 index 0000000000000000000000000000000000000000..248f691fa1a658943d8b4c06f3cb463fa3a73c46 --- /dev/null +++ b/Raw Data/MINING_sorted-preprocessed-data/miners/BitcoinForum_miners_220613_to_270215.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf8649d9327a94062853bfca6c38d4f767df9190e527b732cc898adf0ab7f09e +size 33055879 diff --git a/Raw 
Data/MINING_sorted-preprocessed-data/miners/BitcoinForum_miners_270215_to_170923.csv b/Raw Data/MINING_sorted-preprocessed-data/miners/BitcoinForum_miners_270215_to_170923.csv new file mode 100644 index 0000000000000000000000000000000000000000..edb4636a225db1696185b398f4e7b7ebceb0852d --- /dev/null +++ b/Raw Data/MINING_sorted-preprocessed-data/miners/BitcoinForum_miners_270215_to_170923.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74652992c32cff07e4b387f08a2fc96b36d37feda68febfe56da7960ea350ac4 +size 24231512 diff --git a/Raw Data/MINING_sorted-preprocessed-data/mining/BitcoinForum_mining_060312_to_090714.csv b/Raw Data/MINING_sorted-preprocessed-data/mining/BitcoinForum_mining_060312_to_090714.csv new file mode 100644 index 0000000000000000000000000000000000000000..08d1dcf63d6862071e9e1faa09b9e6e017165643 --- /dev/null +++ b/Raw Data/MINING_sorted-preprocessed-data/mining/BitcoinForum_mining_060312_to_090714.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5594857fb78cb85c3be39ff8128e52722900f4adf856799cf6da22403fb20a35 +size 28303598 diff --git a/Raw Data/MINING_sorted-preprocessed-data/mining/BitcoinForum_mining_090714_to_191216.csv b/Raw Data/MINING_sorted-preprocessed-data/mining/BitcoinForum_mining_090714_to_191216.csv new file mode 100644 index 0000000000000000000000000000000000000000..64dfc173cf40005feddf4cb3198211309e51d611 --- /dev/null +++ b/Raw Data/MINING_sorted-preprocessed-data/mining/BitcoinForum_mining_090714_to_191216.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:720a7f8b75c483d93a17ab1644d9c9ef40867fcdfa1b4a61e66e1a9721a75fcb +size 27849977 diff --git a/Raw Data/MINING_sorted-preprocessed-data/mining/BitcoinForum_mining_090910_to_060312.csv b/Raw Data/MINING_sorted-preprocessed-data/mining/BitcoinForum_mining_090910_to_060312.csv new file mode 100644 index 0000000000000000000000000000000000000000..d89fa2f3c813f060eb5ad12c9b17835b06ef6eb0 --- /dev/null +++ 
b/Raw Data/MINING_sorted-preprocessed-data/mining/BitcoinForum_mining_090910_to_060312.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8d2c191a4a8bd662945df50391c5ede40c3b1445f0e9bc5ee49998245a90156 +size 26982459 diff --git a/Raw Data/MINING_sorted-preprocessed-data/mining/BitcoinForum_mining_191216_to_250923.csv b/Raw Data/MINING_sorted-preprocessed-data/mining/BitcoinForum_mining_191216_to_250923.csv new file mode 100644 index 0000000000000000000000000000000000000000..7f7a5b25789f04f861cfd58ced041163aead3f35 --- /dev/null +++ b/Raw Data/MINING_sorted-preprocessed-data/mining/BitcoinForum_mining_191216_to_250923.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b91b1df6b459b235737522c8aa6b9113f4c6faa6b6b02197db78c81a86c19b59 +size 32041386 diff --git a/Raw Data/MINING_sorted-preprocessed-data/mining_speculation/BitcoinForum_mining_speculation_050116_to_080117.csv b/Raw Data/MINING_sorted-preprocessed-data/mining_speculation/BitcoinForum_mining_speculation_050116_to_080117.csv new file mode 100644 index 0000000000000000000000000000000000000000..0d7369bb4b1fbbacd4bbe8569f33702dca117c7a --- /dev/null +++ b/Raw Data/MINING_sorted-preprocessed-data/mining_speculation/BitcoinForum_mining_speculation_050116_to_080117.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:adeb7b18ac697eac54f7f0a34efca519351ed9c921d3c5f0b0a32bc4507b764b +size 31978755 diff --git a/Raw Data/MINING_sorted-preprocessed-data/mining_speculation/BitcoinForum_mining_speculation_050514_to_141114.csv b/Raw Data/MINING_sorted-preprocessed-data/mining_speculation/BitcoinForum_mining_speculation_050514_to_141114.csv new file mode 100644 index 0000000000000000000000000000000000000000..17837b24ae81383dac012b8eca7e56f4d6088c82 --- /dev/null +++ b/Raw Data/MINING_sorted-preprocessed-data/mining_speculation/BitcoinForum_mining_speculation_050514_to_141114.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:7c9bcad808fda9c947ee9247d5c770967654c11a8ec4a1bb96437b93d2ffd624 +size 30115328 diff --git a/Raw Data/MINING_sorted-preprocessed-data/mining_speculation/BitcoinForum_mining_speculation_050615_to_050116.csv b/Raw Data/MINING_sorted-preprocessed-data/mining_speculation/BitcoinForum_mining_speculation_050615_to_050116.csv new file mode 100644 index 0000000000000000000000000000000000000000..65b42374dd5360b3024fc3ef1102ffc50e6f62f8 --- /dev/null +++ b/Raw Data/MINING_sorted-preprocessed-data/mining_speculation/BitcoinForum_mining_speculation_050615_to_050116.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0512c05700ec6fc439535f68fc51b4f2bada445daec201eb46a34ba79ed55ede +size 31901946 diff --git a/Raw Data/MINING_sorted-preprocessed-data/mining_speculation/BitcoinForum_mining_speculation_080117_to_220120.csv b/Raw Data/MINING_sorted-preprocessed-data/mining_speculation/BitcoinForum_mining_speculation_080117_to_220120.csv new file mode 100644 index 0000000000000000000000000000000000000000..c1c13dac90e3ce822993653ca5e2cca2ab222885 --- /dev/null +++ b/Raw Data/MINING_sorted-preprocessed-data/mining_speculation/BitcoinForum_mining_speculation_080117_to_220120.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1372089e4e779c9f181dccb49b6af15cf8ff74e2767aebb141addbd26c13cb01 +size 30875483 diff --git a/Raw Data/MINING_sorted-preprocessed-data/mining_speculation/BitcoinForum_mining_speculation_141114_to_050615.csv b/Raw Data/MINING_sorted-preprocessed-data/mining_speculation/BitcoinForum_mining_speculation_141114_to_050615.csv new file mode 100644 index 0000000000000000000000000000000000000000..3dfb19618a04535506974ecde7b61776dfed296a --- /dev/null +++ b/Raw Data/MINING_sorted-preprocessed-data/mining_speculation/BitcoinForum_mining_speculation_141114_to_050615.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f820924655a21604232c67fcf52af24d6e13349d7b77f0635ea237bc2883426 +size 30082047 diff 
--git a/Raw Data/MINING_sorted-preprocessed-data/mining_speculation/BitcoinForum_mining_speculation_220120_to_250923.csv b/Raw Data/MINING_sorted-preprocessed-data/mining_speculation/BitcoinForum_mining_speculation_220120_to_250923.csv new file mode 100644 index 0000000000000000000000000000000000000000..734fcb38d5ba9be77fd07bdc8febba096c941552 --- /dev/null +++ b/Raw Data/MINING_sorted-preprocessed-data/mining_speculation/BitcoinForum_mining_speculation_220120_to_250923.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72acd66e2b025dda9198c44122a97063def9c5e5dd47057404c717684f31160e +size 15629305 diff --git a/Raw Data/MINING_sorted-preprocessed-data/mining_speculation/BitcoinForum_mining_speculation_240211_to_050514.csv b/Raw Data/MINING_sorted-preprocessed-data/mining_speculation/BitcoinForum_mining_speculation_240211_to_050514.csv new file mode 100644 index 0000000000000000000000000000000000000000..9592cfb031fd440055926417e52a340a8e2eff39 --- /dev/null +++ b/Raw Data/MINING_sorted-preprocessed-data/mining_speculation/BitcoinForum_mining_speculation_240211_to_050514.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6debc82aec0039cb1fc0f63e4f9c0dc8c38db55cbb78d2d0c1a584b239b6d21b +size 28615700 diff --git a/Raw Data/MINING_sorted-preprocessed-data/mining_support/BitcoinForum_mining_support_100915_to_110420.csv b/Raw Data/MINING_sorted-preprocessed-data/mining_support/BitcoinForum_mining_support_100915_to_110420.csv new file mode 100644 index 0000000000000000000000000000000000000000..162307a181ce0fa281d3d59034e7a2989cf09e76 --- /dev/null +++ b/Raw Data/MINING_sorted-preprocessed-data/mining_support/BitcoinForum_mining_support_100915_to_110420.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba7dbde1b9e0e5db11d1adcc995b9c1cbbe17f2e0f77f05b7b9016adcec5d49f +size 42309746 diff --git a/Raw Data/MINING_sorted-preprocessed-data/mining_support/BitcoinForum_mining_support_110420_to_200923.csv 
b/Raw Data/MINING_sorted-preprocessed-data/mining_support/BitcoinForum_mining_support_110420_to_200923.csv new file mode 100644 index 0000000000000000000000000000000000000000..36177e4ade728f11a5de54216eae10ab152e2aed --- /dev/null +++ b/Raw Data/MINING_sorted-preprocessed-data/mining_support/BitcoinForum_mining_support_110420_to_200923.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cfac1490c6e10c51e64b47ed46ae2552b868a1d60f328a570fda1021966f42b9 +size 53228444 diff --git a/Raw Data/MINING_sorted-preprocessed-data/mining_support/BitcoinForum_mining_support_130511_to_100915.csv b/Raw Data/MINING_sorted-preprocessed-data/mining_support/BitcoinForum_mining_support_130511_to_100915.csv new file mode 100644 index 0000000000000000000000000000000000000000..6cb0b1eda280774d84eca25ce0f1287539272acc --- /dev/null +++ b/Raw Data/MINING_sorted-preprocessed-data/mining_support/BitcoinForum_mining_support_130511_to_100915.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05f3c37b69b2ad60b7ff5844b51b86e1da367e6c0f6940a83b185735fe4dfb6b +size 28020381 diff --git a/Raw Data/MINING_sorted-preprocessed-data/pools/BitcoinForum_pools_020715_to_291117.csv b/Raw Data/MINING_sorted-preprocessed-data/pools/BitcoinForum_pools_020715_to_291117.csv new file mode 100644 index 0000000000000000000000000000000000000000..fa06f42b4d6a723b33e621e0b20e251b69868bda --- /dev/null +++ b/Raw Data/MINING_sorted-preprocessed-data/pools/BitcoinForum_pools_020715_to_291117.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea865ffcdf9c1898c8496c4a74e783a9d7a6b09474ce18193a2f4accead64b28 +size 30294267 diff --git a/Raw Data/MINING_sorted-preprocessed-data/pools/BitcoinForum_pools_110812_to_130713.csv b/Raw Data/MINING_sorted-preprocessed-data/pools/BitcoinForum_pools_110812_to_130713.csv new file mode 100644 index 0000000000000000000000000000000000000000..ecde5e8173c985595b982521c7e50cbd1042f85f --- /dev/null +++ b/Raw 
Data/MINING_sorted-preprocessed-data/pools/BitcoinForum_pools_110812_to_130713.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fed08b1c202edd31248c092f7ac1e5229b86e2f9dacf76318fc985ea1a44166c +size 27325704 diff --git a/Raw Data/MINING_sorted-preprocessed-data/pools/BitcoinForum_pools_130713_to_140814.csv b/Raw Data/MINING_sorted-preprocessed-data/pools/BitcoinForum_pools_130713_to_140814.csv new file mode 100644 index 0000000000000000000000000000000000000000..c4959e17fedc3b49d026c807b32b78f043853c27 --- /dev/null +++ b/Raw Data/MINING_sorted-preprocessed-data/pools/BitcoinForum_pools_130713_to_140814.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59a0aca3e152bb27a1809246565cc7850af9bbc947d9e9175b05a009d4cbd06c +size 28677997 diff --git a/Raw Data/MINING_sorted-preprocessed-data/pools/BitcoinForum_pools_140711_to_261211.csv b/Raw Data/MINING_sorted-preprocessed-data/pools/BitcoinForum_pools_140711_to_261211.csv new file mode 100644 index 0000000000000000000000000000000000000000..70afbc6a67fb76e415edbc753300ea78b5d842d4 --- /dev/null +++ b/Raw Data/MINING_sorted-preprocessed-data/pools/BitcoinForum_pools_140711_to_261211.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:695a6da750bb3be0bcf738a5a723738d5a1325be85e53be9cb263c294b4f553c +size 27038276 diff --git a/Raw Data/MINING_sorted-preprocessed-data/pools/BitcoinForum_pools_140814_to_020715.csv b/Raw Data/MINING_sorted-preprocessed-data/pools/BitcoinForum_pools_140814_to_020715.csv new file mode 100644 index 0000000000000000000000000000000000000000..97ef5a5767c9af4fd7287e7af40c41393234edad --- /dev/null +++ b/Raw Data/MINING_sorted-preprocessed-data/pools/BitcoinForum_pools_140814_to_020715.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26cc4800091e369c841f2ec627f6b64aa7845c72c6263515d70d4c492d259f38 +size 29352900 diff --git a/Raw 
Data/MINING_sorted-preprocessed-data/pools/BitcoinForum_pools_261211_to_110812.csv b/Raw Data/MINING_sorted-preprocessed-data/pools/BitcoinForum_pools_261211_to_110812.csv new file mode 100644 index 0000000000000000000000000000000000000000..46d46dc68946a9e61365b3f83fb3873143bf6316 --- /dev/null +++ b/Raw Data/MINING_sorted-preprocessed-data/pools/BitcoinForum_pools_261211_to_110812.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7779a6023eefe29c39a1103eea4029af91b7e008a27d587c29c834985ad979f +size 27836265 diff --git a/Raw Data/MINING_sorted-preprocessed-data/pools/BitcoinForum_pools_271110_to_140711.csv b/Raw Data/MINING_sorted-preprocessed-data/pools/BitcoinForum_pools_271110_to_140711.csv new file mode 100644 index 0000000000000000000000000000000000000000..a75a29da9a327ad7f6589c28efafda391c39bd08 --- /dev/null +++ b/Raw Data/MINING_sorted-preprocessed-data/pools/BitcoinForum_pools_271110_to_140711.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ab35a1a49cb17429d175b036fc66a2c96a4b48e939feb3762b1ad8fa4ab8920 +size 25300528 diff --git a/Raw Data/MINING_sorted-preprocessed-data/pools/BitcoinForum_pools_291117_to_250923.csv b/Raw Data/MINING_sorted-preprocessed-data/pools/BitcoinForum_pools_291117_to_250923.csv new file mode 100644 index 0000000000000000000000000000000000000000..d047e8698d4a709475585352243e4ff03aa926cc --- /dev/null +++ b/Raw Data/MINING_sorted-preprocessed-data/pools/BitcoinForum_pools_291117_to_250923.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84ae1db166eb331a1c8bd7bcc3c6c0c68768774368ed9c2610d41692ca12658d +size 13032541 diff --git a/Raw Data/README.txt b/Raw Data/README.txt new file mode 100644 index 0000000000000000000000000000000000000000..577c6a78e863ac9f02ec1d88a34e9536c14396d5 --- /dev/null +++ b/Raw Data/README.txt @@ -0,0 +1,53 @@ +# bitcointalk_crawler + +--- + +## DataFrame Columns Description + +### 1. 
`start_edit` +- **Description**: This column represents the date when the post or content was initially created. +- **Type**: Date (format: YYYY-MM-DD) +- **Example**: `2013-11-02` + +### 2. `last_edit` +- **Description**: This column represents the date when the post or content was last edited. For posts that were never edited, it is the same as `start_edit`. +- **Type**: Date (format: YYYY-MM-DD) +- **Example**: `2013-11-02` + +### 3. `author` +- **Description**: The user who created the post. +- **Type**: String +- **Example**: `guyver` + +### 4. `post` +- **Description**: The actual content or message of the post. +- **Type**: String +- **Example**: `before we all get excited about the second batch...` + +### 5. `topic` +- **Description**: The topic or title of the thread in which the post was made. +- **Type**: String +- **Example**: `[EU/UK GROUP BUY] Blue Fury USB miner 2.2 ...` + +### 6. `attachment` +- **Description**: Indicates whether the post has an attachment or not. A value of `1` means there's an attachment (image or video), and `0` means there isn't. The website renders emojis with `img` tags even though they are not attachments; this column ignores emojis, so `1` indicates a true attachment. +- **Type**: Integer (0 or 1) +- **Example**: `0` +- **Note**: The script `attachment_fix.py` is run after the crawling process, because the values the crawler initially writes to this column are not accurate. + +### 7. `link` +- **Description**: Indicates whether the post contains a link or not. A value of `1` means there's a link, and `0` means there isn't. +- **Type**: Integer (0 or 1) +- **Example**: `0` + +### 8. `original_info` +- **Description**: This column contains raw HTML or metadata related to the post. It may contain styling and layout information. 
# Extracts the data of a single forum post from one parsed HTML element of a topic page.
# The caller passes every `td` of class "windowbg"/"windowbg2"; elements that do not wrap
# a post (no `td.td_headerandpost` child) yield None, all others yield a dictionary.
def extract_useful_content_windowbg(tr_tag):
    """
    Extract timestamp, author, text, topic and attachment/link flags of one post.

    Args:
        tr_tag: parsed HTML element exposing ``find``; expected to contain a
            ``td.td_headerandpost`` cell, a ``td.poster_info`` cell and a ``div.post``.

    Returns:
        None when ``tr_tag`` has no ``td.td_headerandpost`` child, otherwise a dict with:
            timestamp: creation time text as shown on the page
                (ex: September 11, 2023, 07:49:45 AM).
            last_edit: time of the last edit, or None when the post was never edited.
                "Today at HH:MM:SS AM" is rewritten to "<today's date>, HH:MM:SS AM".
            author: text of the first link in the poster cell, "Anonymous" when absent.
            post: stripped text of the post body.
            topic: stripped text of the subject line.
            attachment: 1 if the header/post cell contains at least one ``img`` tag, else 0.
            link: 1 if the header/post cell contains at least one ``a.ul`` tag, else 0.
            original_info: the header/post element itself.
    """
    headerandpost = tr_tag.find("td", class_="td_headerandpost")
    if not headerandpost:
        return None

    # The "smalltext" div holds the creation time, optionally followed by
    # "Last edit: <time> by <user>".
    stamp_parts = (
        headerandpost.find("div", class_="smalltext").get_text().split("Last edit: ")
    )
    timestamp = stamp_parts[0].strip()
    last_edit = None
    if len(stamp_parts) > 1:
        edit_time = stamp_parts[1].split("by")[0]
        if "Today at" in edit_time:
            # Resolve the relative "Today" against the crawl date. Previously this value
            # was computed and then immediately overwritten by the raw "Today at ..." text.
            last_edit = (
                date.today().strftime("%B %d, %Y")
                + ", "
                + edit_time.split("Today at")[1].strip()
            )
        else:
            last_edit = edit_time.strip()

    # Some posts have no profile link; fall back to "Anonymous" for those.
    poster_info_tag = tr_tag.find("td", class_="poster_info")
    anchor_tag = poster_info_tag.find("a") if poster_info_tag is not None else None
    author = "Anonymous" if anchor_tag is None else anchor_tag.get_text()

    post = tr_tag.find("div", class_="post").get_text()
    topic = headerandpost.find("div", class_="subject").get_text()

    # Any <img> counts here (emojis included).
    attachment = 0 if headerandpost.find("img") is None else 1
    link = 0 if headerandpost.find("a", class_="ul") is None else 1

    return {
        "timestamp": timestamp,
        "last_edit": last_edit,
        "author": author.strip(),
        "post": post.strip(),
        "topic": topic.strip(),
        "attachment": attachment,
        "link": link,
        "original_info": headerandpost,
    }
# A utility function to save a list (e.g., URLs) to a text file.
# Each item in the list gets its own line in the file.
def save_page_file(data, file_name):
    """
    Write every item of ``data`` to ``file_name``, one item per line.

    The file is overwritten if it already exists. Items are formatted with ``str()``
    and terminated by a real newline so the file can be read back with
    ``readlines()``. The previous implementation wrote the two characters
    backslash + "n" instead, which left all items on one single line.

    Args:
        data: iterable of items (e.g. URL strings) to store.
        file_name: path of the text file to create.
    """
    with open(file_name, "w") as filehandle:
        filehandle.writelines(f"{listitem}\n" for listitem in data)
# Lists the CSV files that sit directly inside the given directory.
def get_files(path):
    """
    Return the paths matching ``<path>/*.csv``.

    Args:
        path: directory to search (not recursive).

    Returns:
        list of matching file paths, in the order ``glob.glob`` yields them.
    """
    csv_pattern = f"{path}/*.csv"
    matches = glob.glob(csv_pattern)
    return matches
# A comprehensive text preprocessing function that applies several common preprocessing steps:
# - URLs are removed from the text.
# - The entire text is converted to lowercase to ensure uniformity.
# - Contractions (like "can't" or "won't") are expanded to their full forms.
# - Punctuation is stripped from the text.
# - Extra whitespaces (if any) are removed.
# - The text is tokenized (split into individual words or tokens).
# - Common words (stopwords) that don't add significant meaning are removed.
# Finally, the cleaned tokens are joined back into a string.
def text_preprocess(text):
    """
    Clean a post for text analysis and return it as a single space-joined string.

    Contractions are expanded BEFORE punctuation is stripped:
    ``remove_sentence_punctuation`` replaces apostrophes with spaces, so with the
    previous order ("can't" -> "can t") ``expand_contractions`` never saw a
    contraction and was effectively a no-op.

    Args:
        text: raw post text (str).

    Returns:
        str: the preprocessed text.
    """
    text = remove_urls(text)
    text = to_lowercase(text)

    # tokenize() splits on single spaces, so joining with " " restores the text
    # unchanged apart from the expanded contractions.
    text = " ".join(expand_contractions(tokenize(text)))

    text = remove_sentence_punctuation(text)
    text = remove_extra_whitespace(text)

    # Tokenize only after expansion so that multi-word expansions ("is not")
    # become separate tokens and go through stopword removal individually.
    tokens = remove_stopwords(tokenize(text))
    return " ".join(tokens)
# The main function of the script:
# - It retrieves all the CSV files from the specified directory.
# - Loops through each file, applying the preprocessing steps.
# - If an error occurs during processing, the error message is appended to an 'error_log.txt' file.
def main(path):
    """
    Preprocess every CSV file found directly inside ``path``.

    A failure on one file does not stop the run: the file path and the exception
    message are appended as one line to ``<path>/error_log.txt`` and processing
    continues with the next file.

    Args:
        path: directory containing the crawled CSV files.
    """
    rawFiles = get_files(path)
    for filePath in tqdm(rawFiles):
        try:
            loop_through_csvs(filePath)
        except Exception as e:
            # Best-effort batch job: record the failure and move on. Each entry ends
            # with a real newline (previously a literal backslash-n was written, so
            # all entries ran together on one line).
            with open(f"{path}/error_log.txt", "a") as f:
                f.write(f"{filePath} -- {e}\n")
def convert_datetime_with_multiple_formats(date_str, formats):
    r"""
    Parse a date string by trying each candidate format in order.

    Args:
        date_str: str that contains date information.
        formats: iterable of strptime-style format strings, tried first to last.

    Returns:
        The result of ``pd.to_datetime`` for the first format that parses ``date_str``.

    Raises:
        ValueError: if none of the formats matches ``date_str``.
    """
    for candidate_format in formats:
        try:
            parsed = pd.to_datetime(date_str, format=candidate_format)
        except ValueError:
            # This format does not fit; fall through to the next candidate.
            continue
        return parsed
    raise ValueError(f"Time data {date_str} doesn't match provided formats")
def remove_meta_info(text):
    r"""
    remove_meta_info function.
    Remove quote headers of the form "Quote from: <user> on <date/time> AM|PM" from a text.

    The date/time part is matched lazily so the match ends at the FIRST " AM"/" PM"
    after the header. A greedy match could run on to a later " AM"/" PM" in the post
    and delete the text in between.

    Args:
        text: post content; any non-string value is converted with ``str()`` first.

    Returns:
        str: the text with all quote headers removed.
    """
    text = str(text)
    return re.sub(r'Quote from: [a-zA-Z0-9_]+ on [a-zA-Z0-9, :]+? (AM|PM)', '', text)
def replace_numbers(text, replace_with=""):
    r"""
    replace_numbers function.
    Substitute every standalone run of digits in a text.

    A run of digits is replaced only when it is delimited by word boundaries on
    both sides, so digits glued to letters (e.g. "s9") are left untouched.

    Args:
        text: input string.
        replace_with: replacement string for each matched number (default is "").

    Returns:
        str: the text with the matched numbers replaced.
    """
    standalone_digits = re.compile(r'\b\d+\b')
    return standalone_digits.sub(replace_with, text)
# Function to process and sort CSV files within a given folder
def process_csvs(folder_path, new_folder_name, chunk_size=10000):
    """
    Merge all CSV files of one board folder, sort them by "last_edit" and write
    the result as fixed-size chunk files.

    Each chunk is saved in ``new_folder_name`` as
    ``BitcoinForum_<board>_<first>_to_<last>.csv`` where ``<board>`` is the base
    name of ``folder_path`` and ``<first>``/``<last>`` are the "last_edit" dates
    (``ddmmyy``) of the chunk's first and last row.

    Args:
        folder_path: directory holding the CSV files of one board.
        new_folder_name: output directory; created if missing.
        chunk_size: maximum number of rows per output file (default 10,000).

    Raises:
        ValueError: if ``chunk_size`` is not a positive integer.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    # Extracting the name of the board from the folder path
    board = os.path.basename(folder_path)
    # Creating a new directory to store the sorted CSV files
    sorted_folder = Path(new_folder_name)
    sorted_folder.mkdir(parents=True, exist_ok=True)

    # Sorted listing + stable sort below make the output independent of the
    # (arbitrary) order in which the OS lists the directory.
    csv_paths = sorted(
        os.path.join(folder_path, name)
        for name in os.listdir(folder_path)
        if name.endswith(".csv")
    )
    if not csv_paths:
        # pd.concat raises on an empty sequence; there is simply nothing to do.
        return

    combined_df = pd.concat(
        (pd.read_csv(csv_path) for csv_path in csv_paths), ignore_index=True
    )
    combined_df = combined_df.sort_values(by="last_edit", kind="stable")

    used_names = set()
    for start in tqdm(range(0, len(combined_df), chunk_size)):
        chunk = combined_df.iloc[start : start + chunk_size]
        start_date = pd.to_datetime(chunk["last_edit"].iloc[0]).strftime("%d%m%y")
        end_date = pd.to_datetime(chunk["last_edit"].iloc[-1]).strftime("%d%m%y")
        stem = f"BitcoinForum_{board}_{start_date}_to_{end_date}"

        # Two chunks can share the same first/last date; add a suffix instead of
        # silently overwriting the chunk written earlier in this run.
        filename = f"{stem}.csv"
        part = 2
        while filename in used_names:
            filename = f"{stem}_part{part}.csv"
            part += 1
        used_names.add(filename)

        chunk.to_csv(os.path.join(sorted_folder, filename), index=False)
Chrome/23.0.1271.64 Safari/537.11", + + "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)", + + "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11", + + "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)", + + "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)", + + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20", + + "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6", + + "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10", + + "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER", + + "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1", + + "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)", + + "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12", + + "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)", + + "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1", + + "Mozilla/5.0 (iPhone; CPU iPhone OS 10_3 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.3 Mobile/14E277 Safari/603.1.30", + + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36", + + "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", + + "Mozilla/4.0 (compatible; MSIE 7.0; 
Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)", + + "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", + + "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)", + + "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)", + + "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)", + + "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)", + + "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)", + + "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6", + + "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1", + + "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0", + + "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5", + + "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6", + + "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11", + + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20", + + "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52", + + "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6", + + "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5", + + "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) 
Chrome/21.0.1180.71 Safari/537.1 LBBROWSER", + + "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", + + "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11", + + "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11", + + "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)", + + "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11", + + "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)", + + "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)", + + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20", + + "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6", + + "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10", + + "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER", + + "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1", + + "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)", + + "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12", + + "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)", + + "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1", + + "Mozilla/5.0 (iPhone; CPU iPhone OS 10_3 like Mac OS X) AppleWebKit/603.1.30 (KHTML, 
like Gecko) Version/10.3 Mobile/14E277 Safari/603.1.30", + + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36", + + "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)" + ] +}