{
  "generated_at": "2026-07-01T17:49:18.976847+00:00",
  "catalog_path": "datasets/catalog.json",
  "raw_dir": "data/raw",
  "max_size_gb": 10.0,
  "include_large": false,
  "datasets": [
    {
      "id": "openforesight",
      "name": "OpenForesight",
      "stream": "dataset_construction",
      "kind": "hf_dataset",
      "priority": "high",
      "source": "nikhilchandak/OpenForesight",
      "target": "data/raw/openforesight",
      "description": "Open-ended forecasting questions generated from global news, used to train OpenForecaster-8B.",
      "status": "downloaded",
      "remote": {
        "repo_id": "nikhilchandak/OpenForesight",
        "last_modified": "2026-04-30T16:55:18+00:00",
        "sha": "6db006a9f8416f26f1e38608b9623e1f3568144e",
        "tags": [
          "task_categories:text-generation",
          "task_categories:question-answering",
          "language:en",
          "license:mit",
          "size_categories:10K<n<100K",
          "format:parquet",
          "modality:text",
          "library:datasets",
          "library:dask",
          "library:polars",
          "library:mlcroissant",
          "arxiv:2512.25070",
          "region:us",
          "forecasting",
          "question-answering",
          "retrieval-augmented-generation",
          "news",
          "bayesian-reasoning"
        ],
        "remote_bytes": 478372096,
        "remote_size": "456.21 MiB",
        "remote_file_count": 12,
        "sample_files": [
          {
            "path": ".gitattributes",
            "bytes": 2461
          },
          {
            "path": "README.md",
            "bytes": 11466
          },
          {
            "path": "assets/accuracy_trainsamples_llama-3.1-8b.png",
            "bytes": 204244
          },
          {
            "path": "assets/brier_scores_trainsamples_llama-3.1-8b.png",
            "bytes": 194330
          },
          {
            "path": "data/aljazeera2026Q1-00000-of-00001.parquet",
            "bytes": 1362638
          },
          {
            "path": "data/aljazeeraLate2025-00000-of-00001.parquet",
            "bytes": 4652896
          },
          {
            "path": "data/skysports2025-00000-of-00001.parquet",
            "bytes": 6901245
          },
          {
            "path": "data/test-00000-of-00001.parquet",
            "bytes": 2793065
          },
          {
            "path": "data/train-00000-of-00003.parquet",
            "bytes": 154156215
          },
          {
            "path": "data/train-00001-of-00003.parquet",
            "bytes": 157010712
          },
          {
            "path": "data/train-00002-of-00003.parquet",
            "bytes": 148410304
          },
          {
            "path": "data/validation-00000-of-00001.parquet",
            "bytes": 2672520
          }
        ]
      },
      "local": {
        "files": 20,
        "bytes": 477972343
      }
    },
    {
      "id": "kalshibench_v2",
      "name": "KalshiBench v2",
      "stream": "dataset_construction",
      "kind": "hf_dataset",
      "priority": "high",
      "source": "2084Collective/kalshibench-v2",
      "target": "data/raw/kalshibench_v2",
      "description": "Kalshi prediction-market questions with real-world outcomes, used for calibration/evaluation.",
      "status": "downloaded",
      "remote": {
        "repo_id": "2084Collective/kalshibench-v2",
        "last_modified": "2025-12-16T18:29:55+00:00",
        "sha": "e70c1077093e12ca604429d7f7dc7562127e81f2",
        "tags": [
          "size_categories:1K<n<10K",
          "format:parquet",
          "format:optimized-parquet",
          "modality:text",
          "library:datasets",
          "library:pandas",
          "library:polars",
          "library:mlcroissant",
          "region:us"
        ],
        "remote_bytes": 205724,
        "remote_size": "200.90 KiB",
        "remote_file_count": 3,
        "sample_files": [
          {
            "path": ".gitattributes",
            "bytes": 2461
          },
          {
            "path": "README.md",
            "bytes": 587
          },
          {
            "path": "data/train-00000-of-00001.parquet",
            "bytes": 202676
          }
        ]
      },
      "local": {
        "files": 6,
        "bytes": 203679
      }
    },
    {
      "id": "kalshibench_v1",
      "name": "KalshiBench v1",
      "stream": "dataset_construction",
      "kind": "hf_dataset",
      "priority": "medium",
      "source": "2084Collective/kalshibench-v1",
      "target": "data/raw/kalshibench_v1",
      "description": "Earlier KalshiBench release; useful for version drift and benchmark reproducibility checks.",
      "status": "downloaded",
      "remote": {
        "repo_id": "2084Collective/kalshibench-v1",
        "last_modified": "2025-12-15T23:25:11+00:00",
        "sha": "d6c79132f972c163a5d111bc2d11061a2ff85ec7",
        "tags": [
          "size_categories:n<1K",
          "format:parquet",
          "format:optimized-parquet",
          "modality:text",
          "library:datasets",
          "library:pandas",
          "library:polars",
          "library:mlcroissant",
          "region:us"
        ],
        "remote_bytes": 165597,
        "remote_size": "161.72 KiB",
        "remote_file_count": 3,
        "sample_files": [
          {
            "path": ".gitattributes",
            "bytes": 2461
          },
          {
            "path": "README.md",
            "bytes": 586
          },
          {
            "path": "data/train-00000-of-00001.parquet",
            "bytes": 162550
          }
        ]
      },
      "local": {
        "files": 6,
        "bytes": 163552
      }
    },
    {
      "id": "forecastbench",
      "name": "ForecastBench datasets",
      "stream": "dataset_construction",
      "kind": "hf_dataset",
      "priority": "high",
      "source": "forecastingresearch/forecastbench-datasets",
      "target": "data/raw/forecastbench",
      "description": "Datasets produced by the ForecastBench dynamic forecasting benchmark, including LLM and human forecast sets.",
      "status": "downloaded",
      "remote": {
        "repo_id": "forecastingresearch/forecastbench-datasets",
        "last_modified": "2026-01-01T02:03:41+00:00",
        "sha": "ee2c2f5fe59a909a7e26bd9bbcc6febad869aace",
        "tags": [
          "language:en",
          "license:cc-by-sa-4.0",
          "arxiv:2409.19839",
          "region:us"
        ],
        "remote_bytes": 122047340,
        "remote_size": "116.39 MiB",
        "remote_file_count": 46,
        "sample_files": [
          {
            "path": ".gitattributes",
            "bytes": 187
          },
          {
            "path": "LICENSE",
            "bytes": 20194
          },
          {
            "path": "README.md",
            "bytes": 1263
          },
          {
            "path": "datasets/forecast_sets/2024-07-21/2024-07-21.ForecastBench.human_public_individual.json",
            "bytes": 23526935
          },
          {
            "path": "datasets/forecast_sets/2024-07-21/2024-07-21.ForecastBench.human_super_individual.json",
            "bytes": 4595808
          },
          {
            "path": "datasets/question_sets/2024-07-21-human.json",
            "bytes": 553367
          },
          {
            "path": "datasets/question_sets/2024-07-21-llm.json",
            "bytes": 4831631
          },
          {
            "path": "datasets/question_sets/2025-03-02-llm.json",
            "bytes": 4762520
          },
          {
            "path": "datasets/question_sets/2025-03-16-llm.json",
            "bytes": 4773527
          },
          {
            "path": "datasets/question_sets/2025-03-30-llm.json",
            "bytes": 4735898
          },
          {
            "path": "datasets/question_sets/2025-04-13-llm.json",
            "bytes": 4757597
          },
          {
            "path": "datasets/question_sets/2025-04-27-llm.json",
            "bytes": 4729971
          },
          {
            "path": "datasets/question_sets/2025-05-11-llm.json",
            "bytes": 4762386
          },
          {
            "path": "datasets/question_sets/2025-05-25-llm.json",
            "bytes": 4763509
          },
          {
            "path": "datasets/question_sets/2025-06-08-llm.json",
            "bytes": 4787653
          },
          {
            "path": "datasets/question_sets/2025-06-22-llm.json",
            "bytes": 4746897
          },
          {
            "path": "datasets/question_sets/2025-08-03-llm.json",
            "bytes": 4810845
          },
          {
            "path": "datasets/question_sets/2025-08-17-llm.json",
            "bytes": 4874256
          },
          {
            "path": "datasets/question_sets/2025-08-31-llm.json",
            "bytes": 4927592
          },
          {
            "path": "datasets/question_sets/2025-10-26-llm.json",
            "bytes": 1337196
          },
          {
            "path": "datasets/question_sets/2025-11-09-llm.json",
            "bytes": 1331351
          },
          {
            "path": "datasets/question_sets/2025-11-23-llm.json",
            "bytes": 1387228
          },
          {
            "path": "datasets/question_sets/2025-12-07-llm.json",
            "bytes": 1377773
          },
          {
            "path": "datasets/question_sets/2025-12-21-llm.json",
            "bytes": 1384658
          },
          {
            "path": "datasets/question_sets/latest-llm.json",
            "bytes": 19
          }
        ]
      },
      "local": {
        "files": 88,
        "bytes": 121875536
      }
    },
    {
      "id": "futurex_past",
      "name": "FutureX Past",
      "stream": "dataset_construction",
      "kind": "hf_dataset",
      "priority": "high",
      "source": "futurex-ai/Futurex-Past",
      "target": "data/raw/futurex_past",
      "description": "Resolved FutureX past questions for dynamic future-prediction evaluation.",
      "status": "downloaded",
      "remote": {
        "repo_id": "futurex-ai/Futurex-Past",
        "last_modified": "2026-06-28T16:12:45+00:00",
        "sha": "d28cfbac8dc5b61ec696fa9517b4087a0cbf7349",
        "tags": [
          "task_categories:question-answering",
          "task_categories:text-generation",
          "language:zh",
          "language:en",
          "license:apache-2.0",
          "size_categories:1K<n<10K",
          "format:parquet",
          "format:optimized-parquet",
          "modality:text",
          "library:datasets",
          "library:pandas",
          "library:polars",
          "library:mlcroissant",
          "arxiv:2508.11987",
          "region:us",
          "future-prediction",
          "benchmark",
          "llm-agents",
          "real-world-events"
        ],
        "remote_bytes": 258224,
        "remote_size": "252.17 KiB",
        "remote_file_count": 3,
        "sample_files": [
          {
            "path": ".gitattributes",
            "bytes": 2461
          },
          {
            "path": "README.md",
            "bytes": 5792
          },
          {
            "path": "data/train-00000-of-00001.parquet",
            "bytes": 249971
          }
        ]
      },
      "local": {
        "files": 6,
        "bytes": 256179
      }
    },
    {
      "id": "futurex_online",
      "name": "FutureX Online",
      "stream": "dataset_construction",
      "kind": "hf_dataset",
      "priority": "medium",
      "source": "futurex-ai/Futurex-Online",
      "target": "data/raw/futurex_online",
      "description": "Current/live FutureX questions, useful for prospective tests but labels may be unresolved.",
      "status": "downloaded",
      "remote": {
        "repo_id": "futurex-ai/Futurex-Online",
        "last_modified": "2026-06-28T16:12:32+00:00",
        "sha": "221d782f16b0e865b9f33cb205331c8eae2fc982",
        "tags": [
          "task_categories:question-answering",
          "size_categories:n<1K",
          "format:parquet",
          "format:optimized-parquet",
          "modality:text",
          "library:datasets",
          "library:pandas",
          "library:polars",
          "library:mlcroissant",
          "arxiv:2508.11987",
          "region:us",
          "future",
          "prediction",
          "LLM",
          "Agents"
        ],
        "remote_bytes": 28282,
        "remote_size": "27.62 KiB",
        "remote_file_count": 3,
        "sample_files": [
          {
            "path": ".gitattributes",
            "bytes": 2461
          },
          {
            "path": "README.md",
            "bytes": 2094
          },
          {
            "path": "data/train-00000-of-00001.parquet",
            "bytes": 23727
          }
        ]
      },
      "local": {
        "files": 6,
        "bytes": 26237
      }
    },
    {
      "id": "prophet_arena_100",
      "name": "Prophet Arena Subset 100",
      "stream": "dataset_construction",
      "kind": "hf_dataset",
      "priority": "medium",
      "source": "prophetarena/Prophet-Arena-Subset-100",
      "target": "data/raw/prophet_arena_100",
      "description": "Small Prophet Arena public subset for prediction-market-style event evaluation.",
      "status": "downloaded",
      "remote": {
        "repo_id": "prophetarena/Prophet-Arena-Subset-100",
        "last_modified": "2025-09-10T15:05:17+00:00",
        "sha": "0451d79d8baf9cb6cf32bd15b23ddb4288dd48d0",
        "tags": [
          "language:en",
          "license:mit",
          "size_categories:n<1K",
          "format:csv",
          "modality:text",
          "library:datasets",
          "library:pandas",
          "library:mlcroissant",
          "library:polars",
          "region:us"
        ],
        "remote_bytes": 2149305,
        "remote_size": "2.05 MiB",
        "remote_file_count": 5,
        "sample_files": [
          {
            "path": ".gitattributes",
            "bytes": 2461
          },
          {
            "path": "README.md",
            "bytes": 6782
          },
          {
            "path": "standalone_evaluator.py",
            "bytes": 5144
          },
          {
            "path": "standalone_predictor.py",
            "bytes": 19805
          },
          {
            "path": "subset_data_100.csv",
            "bytes": 2115113
          }
        ]
      },
      "local": {
        "files": 10,
        "bytes": 2147436
      }
    },
    {
      "id": "prophet_arena_1200",
      "name": "Prophet Arena Subset 1200",
      "stream": "dataset_construction",
      "kind": "hf_dataset",
      "priority": "medium",
      "source": "prophetarena/Prophet-Arena-Subset-1200",
      "target": "data/raw/prophet_arena_1200",
      "description": "Larger Prophet Arena public subset for event forecasting evaluation.",
      "status": "downloaded",
      "remote": {
        "repo_id": "prophetarena/Prophet-Arena-Subset-1200",
        "last_modified": "2025-11-21T08:10:10+00:00",
        "sha": "c94b6f450d7fe3b03688799cce1c8b29838b5d96",
        "tags": [
          "language:en",
          "license:mit",
          "size_categories:1K<n<10K",
          "format:csv",
          "modality:text",
          "library:datasets",
          "library:pandas",
          "library:mlcroissant",
          "library:polars",
          "region:us"
        ],
        "remote_bytes": 9023945,
        "remote_size": "8.61 MiB",
        "remote_file_count": 5,
        "sample_files": [
          {
            "path": ".gitattributes",
            "bytes": 2461
          },
          {
            "path": "README.md",
            "bytes": 7784
          },
          {
            "path": "standalone_evaluator.py",
            "bytes": 4867
          },
          {
            "path": "standalone_predictor.py",
            "bytes": 20310
          },
          {
            "path": "subset_data_1200.csv",
            "bytes": 8988523
          }
        ]
      },
      "local": {
        "files": 10,
        "bytes": 9022077
      }
    },
    {
      "id": "metaculus_binary_chandak",
      "name": "Metaculus Binary",
      "stream": "dataset_construction",
      "kind": "hf_dataset",
      "priority": "medium",
      "source": "nikhilchandak/metaculus-binary",
      "target": "data/raw/metaculus_binary_chandak",
      "description": "Binary Metaculus question dataset useful for non-market crowd-forecast baselines.",
      "status": "downloaded",
      "remote": {
        "repo_id": "nikhilchandak/metaculus-binary",
        "last_modified": "2025-02-17T17:07:13+00:00",
        "sha": "f0005f2fcb4cb39fa4cdbf84ca06f742729764f3",
        "tags": [
          "license:mit",
          "size_categories:1K<n<10K",
          "format:parquet",
          "modality:tabular",
          "modality:text",
          "library:datasets",
          "library:pandas",
          "library:mlcroissant",
          "library:polars",
          "region:us"
        ],
        "remote_bytes": 3540749,
        "remote_size": "3.38 MiB",
        "remote_file_count": 3,
        "sample_files": [
          {
            "path": ".gitattributes",
            "bytes": 2461
          },
          {
            "path": "README.md",
            "bytes": 843
          },
          {
            "path": "train/train-00000-of-00001.parquet",
            "bytes": 3537445
          }
        ]
      },
      "local": {
        "files": 6,
        "bytes": 3538704
      }
    },
    {
      "id": "metaculus_binary_jijivski",
      "name": "Metaculus Binary Legacy",
      "stream": "dataset_construction",
      "kind": "hf_dataset",
      "priority": "low",
      "source": "jijivski/metaculus_binary",
      "target": "data/raw/metaculus_binary_jijivski",
      "description": "Small legacy Metaculus binary dataset; useful mostly for schema comparison.",
      "status": "downloaded",
      "remote": {
        "repo_id": "jijivski/metaculus_binary",
        "last_modified": "2024-02-20T16:39:55+00:00",
        "sha": "09471b75e552e9566b57650977eb0da955343c53",
        "tags": [
          "license:apache-2.0",
          "size_categories:n<1K",
          "format:json",
          "modality:text",
          "library:datasets",
          "library:pandas",
          "library:mlcroissant",
          "library:polars",
          "region:us"
        ],
        "remote_bytes": 92130,
        "remote_size": "89.97 KiB",
        "remote_file_count": 4,
        "sample_files": [
          {
            "path": ".gitattributes",
            "bytes": 2307
          },
          {
            "path": "README.md",
            "bytes": 28
          },
          {
            "path": "metaculus_binary.json",
            "bytes": 0
          },
          {
            "path": "test.json",
            "bytes": 89795
          }
        ]
      },
      "local": {
        "files": 8,
        "bytes": 90317
      }
    },
    {
      "id": "forecast_snapshots_metaculus_large",
      "name": "Forecast Snapshots - Metaculus",
      "stream": "social_forecasting",
      "kind": "hf_dataset",
      "priority": "medium",
      "source": "chestnutforty/forecast-snapshots-metaculus-6f1cdfd9b3",
      "target": "data/raw/forecast_snapshots_metaculus_large",
      "description": "Timestamped Metaculus forecast snapshots for point-in-time aggregation and leakage checks.",
      "status": "downloaded",
      "remote": {
        "repo_id": "chestnutforty/forecast-snapshots-metaculus-6f1cdfd9b3",
        "last_modified": "2025-11-15T02:18:25+00:00",
        "sha": "82c3dc074edcc560c520722ccc5c4c1470dfc449",
        "tags": [
          "license:mit",
          "size_categories:1K<n<10K",
          "format:parquet",
          "modality:tabular",
          "modality:text",
          "library:datasets",
          "library:pandas",
          "library:mlcroissant",
          "library:polars",
          "region:us",
          "prediction-markets",
          "metaculus",
          "snapshot-dataset"
        ],
        "remote_bytes": 6193092,
        "remote_size": "5.91 MiB",
        "remote_file_count": 5,
        "sample_files": [
          {
            "path": ".gitattributes",
            "bytes": 2518
          },
          {
            "path": "README.md",
            "bytes": 1617
          },
          {
            "path": "config.json",
            "bytes": 339
          },
          {
            "path": "snapshot_dataset.csv",
            "bytes": 5653170
          },
          {
            "path": "snapshot_dataset.parquet",
            "bytes": 535448
          }
        ]
      },
      "local": {
        "files": 10,
        "bytes": 6191214
      }
    },
    {
      "id": "forecast_snapshots_metaculus_small",
      "name": "Forecast Snapshots - Metaculus Small",
      "stream": "social_forecasting",
      "kind": "hf_dataset",
      "priority": "low",
      "source": "chestnutforty/forecast-snapshots-metaculus-2cc65706d0",
      "target": "data/raw/forecast_snapshots_metaculus_small",
      "description": "Smaller timestamped Metaculus forecast snapshot mirror.",
      "status": "downloaded",
      "remote": {
        "repo_id": "chestnutforty/forecast-snapshots-metaculus-2cc65706d0",
        "last_modified": "2025-11-15T22:50:24+00:00",
        "sha": "17b3a847cc4549843b1d66862c2612bfe754dfb8",
        "tags": [
          "license:mit",
          "size_categories:n<1K",
          "format:parquet",
          "modality:tabular",
          "modality:text",
          "library:datasets",
          "library:pandas",
          "library:mlcroissant",
          "library:polars",
          "region:us",
          "prediction-markets",
          "metaculus",
          "snapshot-dataset"
        ],
        "remote_bytes": 609102,
        "remote_size": "594.83 KiB",
        "remote_file_count": 5,
        "sample_files": [
          {
            "path": ".gitattributes",
            "bytes": 2461
          },
          {
            "path": "README.md",
            "bytes": 1617
          },
          {
            "path": "config.json",
            "bytes": 383
          },
          {
            "path": "snapshot_dataset.csv",
            "bytes": 447601
          },
          {
            "path": "snapshot_dataset.parquet",
            "bytes": 157040
          }
        ]
      },
      "local": {
        "files": 10,
        "bytes": 607259
      }
    },
    {
      "id": "ir_event_forecasting_sample",
      "name": "IR Event Forecasting Sample",
      "stream": "dataset_construction",
      "kind": "hf_dataset",
      "priority": "low",
      "source": "EventForecasting/IR_event_forecasting_sample",
      "target": "data/raw/ir_event_forecasting_sample",
      "description": "Small international-relations event forecasting sample dataset.",
      "status": "downloaded",
      "remote": {
        "repo_id": "EventForecasting/IR_event_forecasting_sample",
        "last_modified": "2025-03-29T10:11:03+00:00",
        "sha": "febf69ebfaa2fdc8165876189b464349e6e5b940",
        "tags": [
          "size_categories:n<1K",
          "format:parquet",
          "modality:text",
          "library:datasets",
          "library:pandas",
          "library:mlcroissant",
          "library:polars",
          "region:us"
        ],
        "remote_bytes": 90277,
        "remote_size": "88.16 KiB",
        "remote_file_count": 4,
        "sample_files": [
          {
            "path": ".gitattributes",
            "bytes": 2461
          },
          {
            "path": "README.md",
            "bytes": 816
          },
          {
            "path": "data/train-00000-of-00001.parquet",
            "bytes": 58712
          },
          {
            "path": "data/valid-00000-of-00001.parquet",
            "bytes": 28288
          }
        ]
      },
      "local": {
        "files": 8,
        "bytes": 88356
      }
    },
    {
      "id": "kalshi_markets",
      "name": "Kalshi Prediction Markets - Markets",
      "stream": "market_data",
      "kind": "hf_dataset",
      "priority": "high",
      "source": "thomaswmitch/kalshi-prediction-markets-markets",
      "target": "data/raw/kalshi_markets",
      "description": "Kalshi market metadata.",
      "status": "downloaded",
      "remote": {
        "repo_id": "thomaswmitch/kalshi-prediction-markets-markets",
        "last_modified": "2025-09-19T02:52:08+00:00",
        "sha": "ca16c4c91c475cd6406dbab703a41dd12eaf14f6",
        "tags": [
          "task_categories:other",
          "language:en",
          "license:mit",
          "size_categories:10K<n<100K",
          "format:parquet",
          "modality:tabular",
          "modality:text",
          "library:datasets",
          "library:pandas",
          "library:mlcroissant",
          "library:polars",
          "region:us",
          "prediction-markets",
          "kalshi",
          "markets",
          "economics",
          "finance",
          "time-series"
        ],
        "remote_bytes": 1438624,
        "remote_size": "1.37 MiB",
        "remote_file_count": 3,
        "sample_files": [
          {
            "path": ".gitattributes",
            "bytes": 2461
          },
          {
            "path": "README.md",
            "bytes": 3599
          },
          {
            "path": "data/train-00000-of-00001.parquet",
            "bytes": 1432564
          }
        ]
      },
      "local": {
        "files": 6,
        "bytes": 1436580
      }
    },
    {
      "id": "kalshi_trades_wmitch",
      "name": "Kalshi Prediction Markets - Trades",
      "stream": "market_data",
      "kind": "hf_dataset",
      "priority": "high",
      "source": "thomaswmitch/kalshi-prediction-markets-betting",
      "target": "data/raw/kalshi_trades_wmitch",
      "description": "Kalshi trade-level data, useful for market microstructure and execution simulation.",
      "status": "downloaded",
      "remote": {
        "repo_id": "thomaswmitch/kalshi-prediction-markets-betting",
        "last_modified": "2025-09-19T02:50:17+00:00",
        "sha": "2ce5d0a00809667083cbe26788e8fcd1b70bd498",
        "tags": [
          "task_categories:other",
          "language:en",
          "license:mit",
          "size_categories:1M<n<10M",
          "format:parquet",
          "modality:tabular",
          "modality:text",
          "library:datasets",
          "library:dask",
          "library:mlcroissant",
          "library:polars",
          "region:us",
          "prediction-markets",
          "kalshi",
          "trades",
          "economics",
          "finance",
          "time-series"
        ],
        "remote_bytes": 276873446,
        "remote_size": "264.05 MiB",
        "remote_file_count": 4,
        "sample_files": [
          {
            "path": ".gitattributes",
            "bytes": 2461
          },
          {
            "path": "README.md",
            "bytes": 2130
          },
          {
            "path": "data/train-00000-of-00002.parquet",
            "bytes": 138396419
          },
          {
            "path": "data/train-00001-of-00002.parquet",
            "bytes": 138472436
          }
        ]
      },
      "local": {
        "files": 8,
        "bytes": 276871528
      }
    },
    {
      "id": "kalshi_trades_trevorjs",
      "name": "Kalshi Trades",
      "stream": "market_data",
      "kind": "hf_dataset",
      "priority": "high",
      "source": "TrevorJS/kalshi-trades",
      "target": "data/raw/kalshi_trades_trevorjs",
      "description": "Larger Kalshi markets/trades parquet mirror tagged for prediction-market research.",
      "status": "downloaded",
      "remote": {
        "repo_id": "TrevorJS/kalshi-trades",
        "last_modified": "2026-01-30T03:41:44+00:00",
        "sha": "5c622baffd0c8eeac8001ba05525803f544e4953",
        "tags": [
          "task_categories:tabular-classification",
          "task_categories:time-series-forecasting",
          "license:cc-by-4.0",
          "size_categories:100M<n<1B",
          "modality:tabular",
          "modality:text",
          "region:us",
          "prediction-markets",
          "kalshi",
          "finance",
          "trading",
          "event-contracts"
        ],
        "remote_bytes": 5685402748,
        "remote_size": "5.29 GiB",
        "remote_file_count": 22,
        "sample_files": [
          {
            "path": ".gitattributes",
            "bytes": 2461
          },
          {
            "path": "README.md",
            "bytes": 3577
          },
          {
            "path": "markets-0000.parquet",
            "bytes": 136002798
          },
          {
            "path": "markets-0001.parquet",
            "bytes": 364296885
          },
          {
            "path": "markets-0002.parquet",
            "bytes": 400851398
          },
          {
            "path": "markets-0003.parquet",
            "bytes": 212771767
          },
          {
            "path": "trades-0000.parquet",
            "bytes": 313703141
          },
          {
            "path": "trades-0001.parquet",
            "bytes": 288411075
          },
          {
            "path": "trades-0002.parquet",
            "bytes": 291402018
          },
          {
            "path": "trades-0003.parquet",
            "bytes": 293928757
          },
          {
            "path": "trades-0004.parquet",
            "bytes": 289103530
          },
          {
            "path": "trades-0005.parquet",
            "bytes": 293226393
          },
          {
            "path": "trades-0006.parquet",
            "bytes": 302618306
          },
          {
            "path": "trades-0007.parquet",
            "bytes": 304766665
          },
          {
            "path": "trades-0008.parquet",
            "bytes": 295039409
          },
          {
            "path": "trades-0009.parquet",
            "bytes": 296463576
          },
          {
            "path": "trades-0010.parquet",
            "bytes": 296655312
          },
          {
            "path": "trades-0011.parquet",
            "bytes": 302424873
          },
          {
            "path": "trades-0012.parquet",
            "bytes": 281766675
          },
          {
            "path": "trades-0013.parquet",
            "bytes": 287991626
          },
          {
            "path": "trades-0014.parquet",
            "bytes": 292156987
          },
          {
            "path": "trades-0015.parquet",
            "bytes": 141815519
          }
        ]
      },
      "local": {
        "files": 44,
        "bytes": 5685403066
      }
    },
    {
      "id": "forecast_snapshots_kalshi",
      "name": "Forecast Snapshots - Kalshi Events",
      "stream": "market_data",
      "kind": "hf_dataset",
      "priority": "medium",
      "source": "chestnutforty/forecast-snapshots-kalshi_events-768472771c",
      "target": "data/raw/forecast_snapshots_kalshi",
      "description": "Timestamped Kalshi event snapshots.",
      "status": "downloaded",
      "remote": {
        "repo_id": "chestnutforty/forecast-snapshots-kalshi_events-768472771c",
        "last_modified": "2025-11-15T02:04:39+00:00",
        "sha": "c1011d64746393fed42688c9fa76d7fee88945b0",
        "tags": [
          "license:mit",
          "size_categories:10K<n<100K",
          "format:parquet",
          "modality:tabular",
          "modality:text",
          "library:datasets",
          "library:pandas",
          "library:mlcroissant",
          "library:polars",
          "region:us",
          "prediction-markets",
          "kalshi_events",
          "snapshot-dataset"
        ],
        "remote_bytes": 13111727,
        "remote_size": "12.50 MiB",
        "remote_file_count": 5,
        "sample_files": [
          {
            "path": ".gitattributes",
            "bytes": 2518
          },
          {
            "path": "README.md",
            "bytes": 1633
          },
          {
            "path": "config.json",
            "bytes": 347
          },
          {
            "path": "snapshot_dataset.csv",
            "bytes": 12614878
          },
          {
            "path": "snapshot_dataset.parquet",
            "bytes": 492351
          }
        ]
      },
      "local": {
        "files": 10,
        "bytes": 13109850
      }
    },
    {
      "id": "kalshi_filtered",
      "name": "Kalshi Filtered",
      "stream": "market_data",
      "kind": "hf_dataset",
      "priority": "low",
      "source": "dzorlu/kalshi-filtered",
      "target": "data/raw/kalshi_filtered",
      "description": "Small filtered Kalshi dataset for schema comparison.",
      "status": "downloaded",
      "remote": {
        "repo_id": "dzorlu/kalshi-filtered",
        "last_modified": "2025-11-18T05:57:58+00:00",
        "sha": "40f715999a9004ad576a88da593aacbcbb14f7f6",
        "tags": [
          "size_categories:1K<n<10K",
          "format:parquet",
          "format:optimized-parquet",
          "modality:tabular",
          "modality:text",
          "library:datasets",
          "library:pandas",
          "library:polars",
          "library:mlcroissant",
          "region:us"
        ],
        "remote_bytes": 180307,
        "remote_size": "176.08 KiB",
        "remote_file_count": 3,
        "sample_files": [
          {
            "path": ".gitattributes",
            "bytes": 2461
          },
          {
            "path": "README.md",
            "bytes": 1440
          },
          {
            "path": "data/train-00000-of-00001.parquet",
            "bytes": 176406
          }
        ]
      },
      "local": {
        "files": 6,
        "bytes": 178262
      }
    },
    {
      "id": "kalshi_prop_closes",
      "name": "Kalshi Prop Closes",
      "stream": "market_data",
      "kind": "hf_dataset",
      "priority": "medium",
      "source": "mvpeav/kalshi-prop-closes",
      "target": "data/raw/kalshi_prop_closes",
      "description": "Daily Kalshi proposition close records, useful for market-price baselines.",
      "status": "downloaded",
      "remote": {
        "repo_id": "mvpeav/kalshi-prop-closes",
        "last_modified": "2026-07-01T17:44:19+00:00",
        "sha": "d9757bb6166a1b55a3ec7ecbf53bc48918ab5be6",
        "tags": [
          "region:us"
        ],
        "remote_bytes": 31253896,
        "remote_size": "29.81 MiB",
        "remote_file_count": 23,
        "sample_files": [
          {
            "path": ".gitattributes",
            "bytes": 2504
          },
          {
            "path": "prop_closes_2026-06-10.jsonl.gz",
            "bytes": 1033732
          },
          {
            "path": "prop_closes_2026-06-11.jsonl.gz",
            "bytes": 1064937
          },
          {
            "path": "prop_closes_2026-06-12.jsonl.gz",
            "bytes": 1403193
          },
          {
            "path": "prop_closes_2026-06-13.jsonl.gz",
            "bytes": 1659277
          },
          {
            "path": "prop_closes_2026-06-14.jsonl.gz",
            "bytes": 1537293
          },
          {
            "path": "prop_closes_2026-06-15.jsonl.gz",
            "bytes": 1324247
          },
          {
            "path": "prop_closes_2026-06-16.jsonl.gz",
            "bytes": 1537250
          },
          {
            "path": "prop_closes_2026-06-17.jsonl.gz",
            "bytes": 1495617
          },
          {
            "path": "prop_closes_2026-06-18.jsonl.gz",
            "bytes": 1162617
          },
          {
            "path": "prop_closes_2026-06-19.jsonl.gz",
            "bytes": 1469209
          },
          {
            "path": "prop_closes_2026-06-20.jsonl.gz",
            "bytes": 1463528
          },
          {
            "path": "prop_closes_2026-06-21.jsonl.gz",
            "bytes": 1522157
          },
          {
            "path": "prop_closes_2026-06-22.jsonl.gz",
            "bytes": 1694058
          },
          {
            "path": "prop_closes_2026-06-23.jsonl.gz",
            "bytes": 1613149
          },
          {
            "path": "prop_closes_2026-06-24.jsonl.gz",
            "bytes": 1929937
          },
          {
            "path": "prop_closes_2026-06-25.jsonl.gz",
            "bytes": 1327649
          },
          {
            "path": "prop_closes_2026-06-26.jsonl.gz",
            "bytes": 1576866
          },
          {
            "path": "prop_closes_2026-06-27.jsonl.gz",
            "bytes": 1402145
          },
          {
            "path": "prop_closes_2026-06-28.jsonl.gz",
            "bytes": 1270826
          },
          {
            "path": "prop_closes_2026-06-29.jsonl.gz",
            "bytes": 1406028
          },
          {
            "path": "prop_closes_2026-06-30.jsonl.gz",
            "bytes": 1564683
          },
          {
            "path": "prop_closes_2026-07-01.jsonl.gz",
            "bytes": 792994
          }
        ]
      },
      "local": {
        "files": 46,
        "bytes": 31254317
      }
    },
    {
      "id": "kalshi_rfq_momentum",
      "name": "Kalshi RFQ Momentum",
      "stream": "market_data",
      "kind": "hf_dataset",
      "priority": "low",
      "source": "mvpeav/kalshi-rfq-momentum",
      "target": "data/raw/kalshi_rfq_momentum",
      "description": "Small auxiliary Kalshi-related dataset.",
      "status": "downloaded",
      "remote": {
        "repo_id": "mvpeav/kalshi-rfq-momentum",
        "last_modified": "2026-06-19T03:06:02+00:00",
        "sha": "823d29c90f200c6b6692afb6cdecd654f961ab7a",
        "tags": [
          "size_categories:n<1K",
          "modality:text",
          "region:us"
        ],
        "remote_bytes": 3730,
        "remote_size": "3.64 KiB",
        "remote_file_count": 2,
        "sample_files": [
          {
            "path": ".gitattributes",
            "bytes": 2504
          },
          {
            "path": "fotmob_momentum.json.gz",
            "bytes": 1226
          }
        ]
      },
      "local": {
        "files": 0,
        "bytes": 0
      }
    },
    {
      "id": "mlb_polymarket_kalshi_matched_sample",
      "name": "MLB Polymarket/Kalshi Matched Book Sample",
      "stream": "market_data",
      "kind": "hf_dataset",
      "priority": "medium",
      "source": "Coyevans/mlb-polymarket-kalshi-matched-book-sample",
      "target": "data/raw/mlb_polymarket_kalshi_matched_sample",
      "description": "Matched Polymarket/Kalshi order-book sample for cross-venue price comparison.",
      "status": "downloaded",
      "remote": {
        "repo_id": "Coyevans/mlb-polymarket-kalshi-matched-book-sample",
        "last_modified": "2026-06-23T16:35:04+00:00",
        "sha": "9e2dec0ec1daa5a8fb4b699012356df825b4d6fd",
        "tags": [
          "license:cc-by-nc-4.0",
          "size_categories:1K<n<10K",
          "format:parquet",
          "modality:tabular",
          "modality:text",
          "library:datasets",
          "library:pandas",
          "library:polars",
          "library:mlcroissant",
          "region:us",
          "prediction-markets",
          "sports-betting",
          "polymarket",
          "kalshi",
          "mlb",
          "order-book"
        ],
        "remote_bytes": 255680,
        "remote_size": "249.69 KiB",
        "remote_file_count": 13,
        "sample_files": [
          {
            "path": ".gitattributes",
            "bytes": 2504
          },
          {
            "path": "README.md",
            "bytes": 2867
          },
          {
            "path": "SCHEMA.txt",
            "bytes": 4555
          },
          {
            "path": "__pycache__/upload_to_hf.cpython-311.pyc",
            "bytes": 3644
          },
          {
            "path": "__pycache__/upload_to_kaggle.cpython-311.pyc",
            "bytes": 4684
          },
          {
            "path": "__pycache__/upload_to_zenodo.cpython-311.pyc",
            "bytes": 6151
          },
          {
            "path": "mlb_matched_sample_ARI-MIN_2026-06-21.csv",
            "bytes": 110091
          },
          {
            "path": "mlb_matched_sample_ARI-MIN_2026-06-21.parquet",
            "bytes": 22748
          },
          {
            "path": "reddit_comment_prep.md",
            "bytes": 3288
          },
          {
            "path": "reddit_post.md",
            "bytes": 4059
          },
          {
            "path": "spread_chart.png",
            "bytes": 83128
          },
          {
            "path": "upload_to_kaggle.py",
            "bytes": 2973
          },
          {
            "path": "upload_to_zenodo.py",
            "bytes": 4988
          }
        ]
      },
      "local": {
        "files": 18,
        "bytes": 156587
      }
    },
    {
      "id": "polymarket_kalshi_scoresync_sample",
      "name": "Polymarket/Kalshi ScoreSync Orderbook Sample",
      "stream": "market_data",
      "kind": "hf_dataset",
      "priority": "medium",
      "source": "Coyevans/polymarket-kalshi-scoresync-orderbook-sample",
      "target": "data/raw/polymarket_kalshi_scoresync_sample",
      "description": "Small cross-venue order-book sample aligned to sports score events.",
      "status": "downloaded",
      "remote": {
        "repo_id": "Coyevans/polymarket-kalshi-scoresync-orderbook-sample",
        "last_modified": "2026-06-26T13:07:59+00:00",
        "sha": "00bfbde3e77540d08996413ba3a5d5f5aee34261",
        "tags": [
          "license:cc-by-nc-4.0",
          "size_categories:1K<n<10K",
          "format:csv",
          "modality:tabular",
          "modality:text",
          "library:datasets",
          "library:pandas",
          "library:polars",
          "library:mlcroissant",
          "region:us",
          "prediction-markets",
          "polymarket",
          "kalshi",
          "order-book",
          "market-microstructure",
          "sports",
          "backtesting"
        ],
        "remote_bytes": 192826,
        "remote_size": "188.31 KiB",
        "remote_file_count": 5,
        "sample_files": [
          {
            "path": ".gitattributes",
            "bytes": 2504
          },
          {
            "path": "DATA_CARD.json",
            "bytes": 1595
          },
          {
            "path": "README.md",
            "bytes": 3362
          },
          {
            "path": "kalshi_scoresync_PHI_at_TOR_2026-01-11.parquet",
            "bytes": 56948
          },
          {
            "path": "kalshi_scoresync_PHI_at_TOR_HIGHLIGHTS.csv",
            "bytes": 128417
          }
        ]
      },
      "local": {
        "files": 10,
        "bytes": 190939
      }
    },
    {
      "id": "polymarket_10000",
      "name": "Polymarket 10,000",
      "stream": "market_data",
      "kind": "hf_dataset",
      "priority": "low",
      "source": "CK0607/polymarket_10000",
      "target": "data/raw/polymarket_10000",
      "description": "Small Polymarket tabular sample.",
      "status": "downloaded",
      "remote": {
        "repo_id": "CK0607/polymarket_10000",
        "last_modified": "2025-06-24T17:23:55+00:00",
        "sha": "3f68600832e8805f67a3166fed74bf2594a3807b",
        "tags": [
          "size_categories:10K<n<100K",
          "format:parquet",
          "modality:image",
          "modality:tabular",
          "modality:text",
          "library:datasets",
          "library:pandas",
          "library:mlcroissant",
          "library:polars",
          "region:us"
        ],
        "remote_bytes": 4754767,
        "remote_size": "4.53 MiB",
        "remote_file_count": 3,
        "sample_files": [
          {
            "path": ".gitattributes",
            "bytes": 2461
          },
          {
            "path": "README.md",
            "bytes": 1794
          },
          {
            "path": "data/train-00000-of-00001.parquet",
            "bytes": 4750512
          }
        ]
      },
      "local": {
        "files": 6,
        "bytes": 4752723
      }
    },
    {
      "id": "polymarket_clean",
      "name": "Polymarket Clean",
      "stream": "market_data",
      "kind": "hf_dataset",
      "priority": "low",
      "source": "CK0607/polymarket_clean",
      "target": "data/raw/polymarket_clean",
      "description": "Small cleaned Polymarket tabular sample.",
      "status": "downloaded",
      "remote": {
        "repo_id": "CK0607/polymarket_clean",
        "last_modified": "2025-06-27T18:05:06+00:00",
        "sha": "4996a1a8e4b6c6672ce4f12d8f534b767175f8df",
        "tags": [
          "size_categories:1K<n<10K",
          "format:parquet",
          "modality:tabular",
          "modality:text",
          "library:datasets",
          "library:pandas",
          "library:mlcroissant",
          "library:polars",
          "region:us"
        ],
        "remote_bytes": 85003,
        "remote_size": "83.01 KiB",
        "remote_file_count": 3,
        "sample_files": [
          {
            "path": ".gitattributes",
            "bytes": 2461
          },
          {
            "path": "README.md",
            "bytes": 607
          },
          {
            "path": "data/train-00000-of-00001.parquet",
            "bytes": 81935
          }
        ]
      },
      "local": {
        "files": 6,
        "bytes": 82959
      }
    },
    {
      "id": "closed_polymarket_2025h1",
      "name": "Closed Polymarket 2025H1",
      "stream": "market_data",
      "kind": "hf_dataset",
      "priority": "medium",
      "source": "CK0607/closed-polymarket-2025H1",
      "target": "data/raw/closed_polymarket_2025h1",
      "description": "Closed Polymarket questions from the first half of 2025.",
      "status": "downloaded",
      "remote": {
        "repo_id": "CK0607/closed-polymarket-2025H1",
        "last_modified": "2025-07-07T16:34:54+00:00",
        "sha": "49052bccb14f2a62e1b20a0400bc8d1850177040",
        "tags": [
          "size_categories:10K<n<100K",
          "format:parquet",
          "modality:tabular",
          "modality:text",
          "library:datasets",
          "library:pandas",
          "library:mlcroissant",
          "library:polars",
          "region:us"
        ],
        "remote_bytes": 2676151,
        "remote_size": "2.55 MiB",
        "remote_file_count": 3,
        "sample_files": [
          {
            "path": ".gitattributes",
            "bytes": 2461
          },
          {
            "path": "README.md",
            "bytes": 570
          },
          {
            "path": "data/train-00000-of-00001.parquet",
            "bytes": 2673120
          }
        ]
      },
      "local": {
        "files": 6,
        "bytes": 2674107
      }
    },
    {
      "id": "polymarket_dataset_bbasavar",
      "name": "Polymarket Dataset",
      "stream": "market_data",
      "kind": "hf_dataset",
      "priority": "medium",
      "source": "bbasavar/PolymarketDataset",
      "target": "data/raw/polymarket_dataset_bbasavar",
      "description": "Polymarket JSONL dataset packaged for fine-tuning experiments.",
      "status": "downloaded",
      "remote": {
        "repo_id": "bbasavar/PolymarketDataset",
        "last_modified": "2025-12-07T00:34:16+00:00",
        "sha": "d627935246a4551e2fc00e1108ca4d1797f6c8a7",
        "tags": [
          "task_categories:time-series-forecasting",
          "language:en",
          "license:mit",
          "size_categories:10K<n<100K",
          "format:json",
          "modality:text",
          "library:datasets",
          "library:pandas",
          "library:mlcroissant",
          "library:polars",
          "region:us",
          "polymarket",
          "prediction-market",
          "finance",
          "trading"
        ],
        "remote_bytes": 94712346,
        "remote_size": "90.32 MiB",
        "remote_file_count": 3,
        "sample_files": [
          {
            "path": ".gitattributes",
            "bytes": 2513
          },
          {
            "path": "README.md",
            "bytes": 2562
          },
          {
            "path": "fine_tune.jsonl",
            "bytes": 94707271
          }
        ]
      },
      "local": {
        "files": 6,
        "bytes": 94710250
      }
    },
    {
      "id": "polymarket_5min_crypto_updown",
      "name": "Polymarket 5-Minute Crypto Up/Down",
      "stream": "market_data",
      "kind": "hf_dataset",
      "priority": "medium",
      "source": "kachoio/polymarket-5-minute-crypto-up-down-markets",
      "target": "data/raw/polymarket_5min_crypto_updown",
      "description": "Polymarket 5-minute crypto up/down market and tick data.",
      "status": "downloaded",
      "remote": {
        "repo_id": "kachoio/polymarket-5-minute-crypto-up-down-markets",
        "last_modified": "2026-06-17T16:38:13+00:00",
        "sha": "42d917dc8e3205dde8ac909792af0cce2d715c9f",
        "tags": [
          "task_categories:time-series-forecasting",
          "task_categories:tabular-classification",
          "license:cc0-1.0",
          "size_categories:10M<n<100M",
          "format:parquet",
          "modality:tabular",
          "modality:text",
          "library:datasets",
          "library:pandas",
          "library:polars",
          "library:mlcroissant",
          "region:us",
          "finance",
          "crypto",
          "prediction-markets",
          "polymarket",
          "order-book",
          "time-series"
        ],
        "remote_bytes": 725023119,
        "remote_size": "691.44 MiB",
        "remote_file_count": 16,
        "sample_files": [
          {
            "path": ".gitattributes",
            "bytes": 2504
          },
          {
            "path": "README.md",
            "bytes": 7143
          },
          {
            "path": "bnb_markets.parquet",
            "bytes": 2994440
          },
          {
            "path": "bnb_ticks.parquet",
            "bytes": 76690643
          },
          {
            "path": "btc_markets.parquet",
            "bytes": 3921352
          },
          {
            "path": "btc_ticks.parquet",
            "bytes": 182475803
          },
          {
            "path": "doge_markets.parquet",
            "bytes": 2999540
          },
          {
            "path": "doge_ticks.parquet",
            "bytes": 80060676
          },
          {
            "path": "eth_markets.parquet",
            "bytes": 3042915
          },
          {
            "path": "eth_ticks.parquet",
            "bytes": 109211078
          },
          {
            "path": "hype_markets.parquet",
            "bytes": 2986889
          },
          {
            "path": "hype_ticks.parquet",
            "bytes": 74977804
          },
          {
            "path": "sol_markets.parquet",
            "bytes": 3002018
          },
          {
            "path": "sol_ticks.parquet",
            "bytes": 92356363
          },
          {
            "path": "xrp_markets.parquet",
            "bytes": 2994280
          },
          {
            "path": "xrp_ticks.parquet",
            "bytes": 87299671
          }
        ]
      },
      "local": {
        "files": 32,
        "bytes": 725022648
      }
    },
    {
      "id": "polymarket_minute_parquet",
      "name": "Polymarket Minute Parquet",
      "stream": "market_data",
      "kind": "hf_dataset",
      "priority": "medium",
      "source": "Mithilss/polymarket_minute_parquet",
      "target": "data/raw/polymarket_minute_parquet",
      "description": "Minute-level Polymarket parquet mirror.",
      "status": "downloaded",
      "remote": {
        "repo_id": "Mithilss/polymarket_minute_parquet",
        "last_modified": "2025-12-27T04:50:42+00:00",
        "sha": "84509739a1e4ca397af7e811d7042735fc365344",
        "tags": [
          "size_categories:100M<n<1B",
          "format:parquet",
          "format:optimized-parquet",
          "modality:text",
          "library:datasets",
          "library:dask",
          "library:polars",
          "library:mlcroissant",
          "region:us"
        ],
        "remote_bytes": 1432132468,
        "remote_size": "1.33 GiB",
        "remote_file_count": 20,
        "sample_files": [
          {
            "path": ".gitattributes",
            "bytes": 2461
          },
          {
            "path": "README.md",
            "bytes": 527
          },
          {
            "path": "data/7397d2df-6622-4860-8797-6803b5740470.parquet",
            "bytes": 8043
          },
          {
            "path": "data/train-00000-of-00017.parquet",
            "bytes": 44720136
          },
          {
            "path": "data/train-00001-of-00017.parquet",
            "bytes": 44566730
          },
          {
            "path": "data/train-00002-of-00017.parquet",
            "bytes": 89573575
          },
          {
            "path": "data/train-00003-of-00017.parquet",
            "bytes": 89352774
          },
          {
            "path": "data/train-00004-of-00017.parquet",
            "bytes": 89417491
          },
          {
            "path": "data/train-00005-of-00017.parquet",
            "bytes": 89805514
          },
          {
            "path": "data/train-00006-of-00017.parquet",
            "bytes": 89481450
          },
          {
            "path": "data/train-00007-of-00017.parquet",
            "bytes": 89514284
          },
          {
            "path": "data/train-00008-of-00017.parquet",
            "bytes": 89250901
          },
          {
            "path": "data/train-00009-of-00017.parquet",
            "bytes": 89623518
          },
          {
            "path": "data/train-00010-of-00017.parquet",
            "bytes": 89315257
          },
          {
            "path": "data/train-00011-of-00017.parquet",
            "bytes": 89654677
          },
          {
            "path": "data/train-00012-of-00017.parquet",
            "bytes": 89713065
          },
          {
            "path": "data/train-00013-of-00017.parquet",
            "bytes": 89432154
          },
          {
            "path": "data/train-00014-of-00017.parquet",
            "bytes": 88949374
          },
          {
            "path": "data/train-00015-of-00017.parquet",
            "bytes": 89615893
          },
          {
            "path": "data/train-00016-of-00017.parquet",
            "bytes": 90134644
          }
        ]
      },
      "local": {
        "files": 40,
        "bytes": 1432132535
      }
    },
    {
      "id": "polymarket_full_sii",
      "name": "Polymarket Historical Data",
      "stream": "market_data",
      "kind": "hf_dataset",
      "priority": "medium",
      "source": "SII-WANGZJ/Polymarket_data",
      "target": "data/raw/polymarket_full_sii",
      "description": "Large historical Polymarket dataset; useful for deep market microstructure research but over 160 GiB.",
      "status": "skipped",
      "reason": "large_opt_in",
      "remote": {
        "repo_id": "SII-WANGZJ/Polymarket_data",
        "last_modified": "2026-05-04T21:37:05+00:00",
        "sha": "8be4197cd2f71ee03f61af58cd375fe496ef40d7",
        "tags": [
          "size_categories:1B<n<10B",
          "modality:tabular",
          "modality:text",
          "region:us"
        ],
        "remote_bytes": 170846141010,
        "remote_size": "159.11 GiB",
        "remote_file_count": 8,
        "sample_files": [
          {
            "path": ".gitattributes",
            "bytes": 2461
          },
          {
            "path": "README.md",
            "bytes": 18323
          },
          {
            "path": "markets.parquet",
            "bytes": 165315875
          },
          {
            "path": "orderfilled_part1.parquet",
            "bytes": 38370288935
          },
          {
            "path": "orderfilled_part2.parquet",
            "bytes": 40792065925
          },
          {
            "path": "quant.parquet",
            "bytes": 27351315489
          },
          {
            "path": "trades.parquet",
            "bytes": 27995657622
          },
          {
            "path": "users.parquet",
            "bytes": 36171476380
          }
        ]
      },
      "local": {
        "files": 0,
        "bytes": 0
      }
    },
    {
      "id": "polymarket_crypto_derivatives",
      "name": "Polymarket Crypto Derivatives",
      "stream": "market_data",
      "kind": "hf_dataset",
      "priority": "low",
      "source": "trentmkelly/polymarket_crypto_derivatives",
      "target": "data/raw/polymarket_crypto_derivatives",
      "description": "Large Polymarket crypto derivatives mirror with many small files; opt-in because of size and file count.",
      "status": "skipped",
      "reason": "large_opt_in",
      "remote": {
        "repo_id": "trentmkelly/polymarket_crypto_derivatives",
        "last_modified": "2026-05-12T21:50:15+00:00",
        "sha": "6be20463ce33795178c121e7bd15ed428904b5bd",
        "tags": [
          "task_categories:tabular-classification",
          "task_categories:time-series-forecasting",
          "language:en",
          "license:cc-by-sa-4.0",
          "region:us",
          "polymarket",
          "crypto",
          "market-data",
          "order-book",
          "parquet",
          "zstd"
        ],
        "remote_bytes": 19225211908,
        "remote_size": "17.90 GiB",
        "remote_file_count": 53010,
        "sample_files": [
          {
            "path": ".gitattributes",
            "bytes": 2504
          },
          {
            "path": "README.md",
            "bytes": 2267
          },
          {
            "path": "btc15m_market1402567_2026-02-21_15-45-00_all/book_levels.parquet",
            "bytes": 178967
          },
          {
            "path": "btc15m_market1402567_2026-02-21_15-45-00_all/events.parquet",
            "bytes": 232952
          },
          {
            "path": "btc15m_market1402567_2026-02-21_15-45-00_all/steps.parquet",
            "bytes": 106028
          },
          {
            "path": "btc15m_market1402664_2026-02-21_16-00-00_all/book_levels.parquet",
            "bytes": 395249
          },
          {
            "path": "btc15m_market1402664_2026-02-21_16-00-00_all/events.parquet",
            "bytes": 449079
          },
          {
            "path": "btc15m_market1402664_2026-02-21_16-00-00_all/steps.parquet",
            "bytes": 241829
          },
          {
            "path": "btc15m_market1402780_2026-02-21_16-15-00_all/book_levels.parquet",
            "bytes": 494816
          },
          {
            "path": "btc15m_market1402780_2026-02-21_16-15-00_all/events.parquet",
            "bytes": 587283
          },
          {
            "path": "btc15m_market1402780_2026-02-21_16-15-00_all/steps.parquet",
            "bytes": 244817
          },
          {
            "path": "btc15m_market1402886_2026-02-21_16-30-00_all/book_levels.parquet",
            "bytes": 594522
          },
          {
            "path": "btc15m_market1402886_2026-02-21_16-30-00_all/events.parquet",
            "bytes": 761029
          },
          {
            "path": "btc15m_market1402886_2026-02-21_16-30-00_all/steps.parquet",
            "bytes": 291285
          },
          {
            "path": "btc15m_market1402904_2026-02-21_16-45-00_all/book_levels.parquet",
            "bytes": 540313
          },
          {
            "path": "btc15m_market1402904_2026-02-21_16-45-00_all/events.parquet",
            "bytes": 775777
          },
          {
            "path": "btc15m_market1402904_2026-02-21_16-45-00_all/steps.parquet",
            "bytes": 286569
          },
          {
            "path": "btc15m_market1403078_2026-02-21_17-00-00_all/book_levels.parquet",
            "bytes": 483102
          },
          {
            "path": "btc15m_market1403078_2026-02-21_17-00-00_all/events.parquet",
            "bytes": 699434
          },
          {
            "path": "btc15m_market1403078_2026-02-21_17-00-00_all/steps.parquet",
            "bytes": 265666
          },
          {
            "path": "btc15m_market1403101_2026-02-21_17-15-00_all/book_levels.parquet",
            "bytes": 535903
          },
          {
            "path": "btc15m_market1403101_2026-02-21_17-15-00_all/events.parquet",
            "bytes": 736549
          },
          {
            "path": "btc15m_market1403101_2026-02-21_17-15-00_all/steps.parquet",
            "bytes": 290004
          },
          {
            "path": "btc15m_market1403128_2026-02-21_17-30-00_all/book_levels.parquet",
            "bytes": 399899
          },
          {
            "path": "btc15m_market1403128_2026-02-21_17-30-00_all/events.parquet",
            "bytes": 482603
          }
        ]
      },
      "local": {
        "files": 0,
        "bytes": 0
      }
    },
    {
      "id": "polymarket_crypto_updown",
      "name": "Polymarket Crypto Up/Down",
      "stream": "market_data",
      "kind": "hf_dataset",
      "priority": "low",
      "source": "aliplayer1/polymarket-crypto-updown",
      "target": "data/raw/polymarket_crypto_updown",
      "description": "Large Polymarket crypto up/down dataset; opt-in because it is tens of GiB.",
      "status": "skipped",
      "reason": "large_opt_in",
      "remote": {
        "repo_id": "aliplayer1/polymarket-crypto-updown",
        "last_modified": "2026-04-26T18:05:40+00:00",
        "sha": "ba1ad37cbcdd720cced20f1cdc97c2cf347cad6c",
        "tags": [
          "task_categories:time-series-forecasting",
          "task_categories:tabular-classification",
          "language:en",
          "license:mit",
          "size_categories:1B<n<10B",
          "format:parquet",
          "modality:tabular",
          "modality:text",
          "library:datasets",
          "library:dask",
          "library:polars",
          "library:mlcroissant",
          "region:us",
          "polymarket",
          "prediction-markets",
          "crypto",
          "on-chain",
          "orderbook",
          "bitcoin",
          "ethereum",
          "defi",
          "finance"
        ],
        "remote_bytes": 29079226982,
        "remote_size": "27.08 GiB",
        "remote_file_count": 2271,
        "sample_files": [
          {
            "path": ".gitattributes",
            "bytes": 2504
          },
          {
            "path": "README.md",
            "bytes": 6377
          },
          {
            "path": "data/.gap_manifest.json",
            "bytes": 493283
          },
          {
            "path": "data/heartbeats/part-0.parquet",
            "bytes": 1675177
          },
          {
            "path": "data/markets.parquet",
            "bytes": 13420321
          },
          {
            "path": "data/orderbook/crypto=BNB/timeframe=1-hour/part-0.parquet",
            "bytes": 193956951
          },
          {
            "path": "data/orderbook/crypto=BNB/timeframe=1-hour/part-ws-1777225214232-000.parquet",
            "bytes": 27515
          },
          {
            "path": "data/orderbook/crypto=BNB/timeframe=1-hour/part-ws-1777226633914-000.parquet",
            "bytes": 34041
          },
          {
            "path": "data/orderbook/crypto=BNB/timeframe=15-minute/part-0.parquet",
            "bytes": 623736700
          },
          {
            "path": "data/orderbook/crypto=BNB/timeframe=15-minute/part-ws-1777225214341-000.parquet",
            "bytes": 71123
          },
          {
            "path": "data/orderbook/crypto=BNB/timeframe=15-minute/part-ws-1777226634242-000.parquet",
            "bytes": 267759
          },
          {
            "path": "data/orderbook/crypto=BNB/timeframe=15-minute/part-ws-1777226634242-001.parquet",
            "bytes": 4937
          },
          {
            "path": "data/orderbook/crypto=BNB/timeframe=4-hour/part-0.parquet",
            "bytes": 165500781
          },
          {
            "path": "data/orderbook/crypto=BNB/timeframe=4-hour/part-ws-1777225214101-000.parquet",
            "bytes": 31934
          },
          {
            "path": "data/orderbook/crypto=BNB/timeframe=4-hour/part-ws-1777226633599-000.parquet",
            "bytes": 71389
          },
          {
            "path": "data/orderbook/crypto=BNB/timeframe=5-minute/part-0.parquet",
            "bytes": 930476250
          },
          {
            "path": "data/orderbook/crypto=BNB/timeframe=5-minute/part-ws-1777225213898-000.parquet",
            "bytes": 85627
          },
          {
            "path": "data/orderbook/crypto=BNB/timeframe=5-minute/part-ws-1777226633160-000.parquet",
            "bytes": 143460
          },
          {
            "path": "data/orderbook/crypto=BNB/timeframe=5-minute/part-ws-1777226633160-001.parquet",
            "bytes": 8122
          },
          {
            "path": "data/orderbook/crypto=BTC/timeframe=1-hour/part-0.parquet",
            "bytes": 637485202
          },
          {
            "path": "data/orderbook/crypto=BTC/timeframe=1-hour/part-ws-1777225446644-000.parquet",
            "bytes": 386514
          },
          {
            "path": "data/orderbook/crypto=BTC/timeframe=1-hour/part-ws-1777225446644-001.parquet",
            "bytes": 425493
          },
          {
            "path": "data/orderbook/crypto=BTC/timeframe=1-hour/part-ws-1777225446644-002.parquet",
            "bytes": 505401
          },
          {
            "path": "data/orderbook/crypto=BTC/timeframe=1-hour/part-ws-1777225446644-003.parquet",
            "bytes": 502877
          },
          {
            "path": "data/orderbook/crypto=BTC/timeframe=1-hour/part-ws-1777225446644-004.parquet",
            "bytes": 308875
          }
        ]
      },
      "local": {
        "files": 0,
        "bytes": 0
      }
    },
    {
      "id": "polymarket_onchain_v1",
      "name": "Polymarket On-Chain v1",
      "stream": "market_data",
      "kind": "hf_dataset",
      "priority": "low",
      "source": "moose-code/polymarket-onchain-v1",
      "target": "data/raw/polymarket_onchain_v1",
      "description": "Large on-chain Polymarket data mirror; opt-in because it is over 100 GiB.",
      "status": "skipped",
      "reason": "large_opt_in",
      "remote": {
        "repo_id": "moose-code/polymarket-onchain-v1",
        "last_modified": "2026-07-01T11:52:29+00:00",
        "sha": "7746e0aa1a8c9130380d0d9e8654f5681690e083",
        "tags": [
          "license:cc-by-4.0",
          "size_categories:1B<n<10B",
          "modality:image",
          "modality:text",
          "region:us",
          "polymarket",
          "prediction-markets",
          "crypto",
          "defi",
          "onchain",
          "polygon",
          "duckdb"
        ],
        "remote_bytes": 127187515308,
        "remote_size": "118.45 GiB",
        "remote_file_count": 390,
        "sample_files": [
          {
            "path": ".gitattributes",
            "bytes": 2504
          },
          {
            "path": "README.md",
            "bytes": 7314
          },
          {
            "path": "SNAPSHOT.json",
            "bytes": 1216
          },
          {
            "path": "collateral.parquet",
            "bytes": 153
          },
          {
            "path": "condition.parquet",
            "bytes": 125920014
          },
          {
            "path": "fee_refunded/year=2026/month=01.parquet",
            "bytes": 2485833773
          },
          {
            "path": "fee_refunded/year=2026/month=02.parquet",
            "bytes": 6288190734
          },
          {
            "path": "fee_refunded/year=2026/month=03.parquet",
            "bytes": 10777956286
          },
          {
            "path": "fee_refunded/year=2026/month=04.parquet",
            "bytes": 7911372535
          },
          {
            "path": "fixed_product_market_maker.parquet",
            "bytes": 4454044
          },
          {
            "path": "fpmm_funding_addition.parquet",
            "bytes": 8453155
          },
          {
            "path": "fpmm_funding_removal.parquet",
            "bytes": 7639633
          },
          {
            "path": "fpmm_pool_membership.parquet",
            "bytes": 1433456
          },
          {
            "path": "fpmm_transaction/year=2020/month=09.parquet",
            "bytes": 4777
          },
          {
            "path": "fpmm_transaction/year=2020/month=10.parquet",
            "bytes": 96007
          },
          {
            "path": "fpmm_transaction/year=2020/month=11.parquet",
            "bytes": 1130345
          },
          {
            "path": "fpmm_transaction/year=2020/month=12.parquet",
            "bytes": 686071
          },
          {
            "path": "fpmm_transaction/year=2021/month=01.parquet",
            "bytes": 1563164
          },
          {
            "path": "fpmm_transaction/year=2021/month=02.parquet",
            "bytes": 1754810
          },
          {
            "path": "fpmm_transaction/year=2021/month=03.parquet",
            "bytes": 2176417
          },
          {
            "path": "fpmm_transaction/year=2021/month=04.parquet",
            "bytes": 3005253
          },
          {
            "path": "fpmm_transaction/year=2021/month=05.parquet",
            "bytes": 3215332
          },
          {
            "path": "fpmm_transaction/year=2021/month=06.parquet",
            "bytes": 3836490
          },
          {
            "path": "fpmm_transaction/year=2021/month=07.parquet",
            "bytes": 1780972
          },
          {
            "path": "fpmm_transaction/year=2021/month=08.parquet",
            "bytes": 1664976
          }
        ]
      },
      "local": {
        "files": 0,
        "bytes": 0
      }
    },
    {
      "id": "autocast",
      "name": "Autocast",
      "stream": "dataset_construction",
      "kind": "git_repo",
      "priority": "high",
      "source": "https://github.com/andyzoujm/autocast.git",
      "target": "data/raw/autocast",
      "description": "Forecasting questions and dated news corpus from forecasting tournaments.",
      "status": "cloned",
      "commit": "f7907d45dd3f5e58d70834f2d7770c404ef4baba",
      "local": {
        "files": 63,
        "bytes": 5443477
      }
    },
    {
      "id": "mirai",
      "name": "MIRAI",
      "stream": "dataset_construction",
      "kind": "git_repo",
      "priority": "medium",
      "source": "https://github.com/yecchen/MIRAI.git",
      "target": "data/raw/mirai",
      "description": "International event forecasting benchmark with tool-use environment.",
      "status": "cloned",
      "commit": "badda88c4ddc48992708c783d684a3feebf15a61",
      "local": {
        "files": 83,
        "bytes": 9194339
      }
    },
    {
      "id": "polybench",
      "name": "PolyBench",
      "stream": "dataset_construction",
      "kind": "git_repo",
      "priority": "medium",
      "source": "https://github.com/PolyBench/PolyBench.git",
      "target": "data/raw/polybench",
      "description": "Polymarket live market benchmark with CLOB/news snapshots, if released in the repository.",
      "status": "cloned",
      "commit": "280fbb3b29d925f1167005ee03b6116f78761504",
      "local": {
        "files": 53,
        "bytes": 2596135
      }
    },
    {
      "id": "prophet",
      "name": "PROPHET",
      "stream": "dataset_construction",
      "kind": "git_repo",
      "priority": "medium",
      "source": "https://github.com/TZWwww/PROPHET.git",
      "target": "data/raw/prophet",
      "description": "Inferable future forecasting benchmark with causal intervened likelihood estimation.",
      "status": "cloned",
      "commit": "6069c44495be31c59174ec07367e7bba9d5dfe08",
      "local": {
        "files": 41,
        "bytes": 67905
      }
    },
    {
      "id": "openforecast",
      "name": "OpenForecast",
      "stream": "dataset_construction",
      "kind": "manual",
      "priority": "medium",
      "source": "https://github.com/miaomiao1215/Openforecast",
      "target": "data/raw/openforecast",
      "description": "Open-ended event forecasting dataset hosted on Google Drive; manual/gdown download may be required.",
      "status": "skipped",
      "reason": "manual_or_disabled",
      "local": {
        "files": 0,
        "bytes": 0
      }
    },
    {
      "id": "halawi_llm_forecasting",
      "name": "Approaching Human-Level Forecasting code/data",
      "stream": "social_forecasting",
      "kind": "git_repo",
      "priority": "medium",
      "source": "https://github.com/dannyallover/llm_forecasting.git",
      "target": "data/raw/halawi_llm_forecasting",
      "description": "Code and any available data/artifacts for the Halawi et al. forecasting system.",
      "status": "cloned",
      "commit": "85e72d026fc0703f16e7aef6faaec99cc5f4041d",
      "local": {
        "files": 83,
        "bytes": 11042841
      }
    }
  ]
}