This guide teaches you how to load Solana blockchain data directly from HuggingFace using the
datasets library. You'll learn how to query with DuckDB, convert to pandas,
and analyze blockchain data without managing downloads yourself.
SolArchive is now available on HuggingFace at solarchive/solarchive. This makes it incredibly easy to load Solana blockchain data directly into Python without managing downloads or storage yourself.
The dataset contains three types of data, each organized into date partitions:

- txs/2025-11-01 (transactions, one partition per day)
- tokens/2025-11 (token metadata, one partition per month)
- accounts/2025-11 (account snapshots, one partition per month)
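Transaction partitions are keyed by day, while token and account partitions are keyed by month. As a quick reference, here is a small sketch that maps a date to the data_dir values used throughout this guide (partition_dirs is a hypothetical helper, not part of the dataset or any library):

from datetime import date

# Hypothetical helper: build the data_dir strings for each data type.
# Transactions are partitioned by day; tokens and accounts by month.
def partition_dirs(day: date) -> dict:
    return {
        "txs": f"txs/{day:%Y-%m-%d}",
        "tokens": f"tokens/{day:%Y-%m}",
        "accounts": f"accounts/{day:%Y-%m}",
    }

print(partition_dirs(date(2025, 11, 1)))
# {'txs': 'txs/2025-11-01', 'tokens': 'tokens/2025-11', 'accounts': 'accounts/2025-11'}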
You'll need Python 3.8+ and the following packages:

pip install datasets duckdb pandas

The simplest way to load transaction data is using the HuggingFace datasets library:
from datasets import load_dataset
# Load a specific transaction partition (one day)
ds = load_dataset(
"solarchive/solarchive",
data_dir="txs/2025-11-01",
split="train"
)
print(f"Loaded {len(ds)} transactions")
print(f"Columns: {ds.column_names}")
# View first transaction
print(ds[0])

Loading token metadata or account snapshots works the same way:
# Load token metadata for a month
tokens = load_dataset(
"solarchive/solarchive",
data_dir="tokens/2021-04",
split="train"
)
# Load account snapshots for a month
accounts = load_dataset(
"solarchive/solarchive",
data_dir="accounts/2021-04",
split="train"
)
print(f"Loaded {len(tokens)} token records")
print(f"Loaded {len(accounts)} account snapshots")

Once loaded, you can query the data efficiently with DuckDB:
import duckdb
# Load transactions
ds = load_dataset(
"solarchive/solarchive",
data_dir="txs/2025-11-01",
split="train"
)
# Convert to DuckDB relation for SQL queries
con = duckdb.connect()
rel = con.from_arrow(ds.data.table)
# Find all failed transactions
failed = rel.filter("status = 'Failed'").to_df()
print(f"Found {len(failed)} failed transactions")
# Calculate average fee
avg_fee = rel.aggregate("avg(fee / 1e9) as avg_fee_sol").fetchone()
print(f"Average fee: {avg_fee[0]:.6f} SOL")
# Find transactions with high compute usage
high_compute = rel.filter("compute_units_consumed > 1000000").to_df()
print(f"High compute transactions: {len(high_compute)}")

You can easily convert to pandas for further analysis:
import pandas as pd
# Load and convert to pandas DataFrame
ds = load_dataset(
"solarchive/solarchive",
data_dir="txs/2025-11-01",
split="train"
)
df = ds.to_pandas()
# Now use familiar pandas operations
print(df.head())
print(df.describe())
# Filter and analyze
successful = df[df['status'] == 'Success']
print(f"Success rate: {len(successful) / len(df) * 100:.2f}%")

To analyze data across multiple days or months, you can load and concatenate partitions:
from datasets import load_dataset, concatenate_datasets
# Load multiple days of transactions
partitions = ['2025-11-01', '2025-11-02', '2025-11-03']
daily_datasets = [
    load_dataset(
        "solarchive/solarchive",
        data_dir=f"txs/{partition}",
        split="train"
    )
    for partition in partitions
]
# Combine into single dataset
combined = concatenate_datasets(daily_datasets)
print(f"Total transactions: {len(combined)}")
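If you only need a few columns, you can also drop the rest before concatenating to keep memory usage down. This sketch assumes your installed datasets version provides Dataset.select_columns, and uses the signature, fee, and status columns that appear in the examples above:

# Keep only the columns needed for the analysis before combining partitions
# (assumes Dataset.select_columns is available in your datasets version)
slim = [
    load_dataset(
        "solarchive/solarchive",
        data_dir=f"txs/{partition}",
        split="train"
    ).select_columns(["signature", "fee", "status"])
    for partition in partitions
]
combined_slim = concatenate_datasets(slim)
print(f"Combined (3 columns only): {len(combined_slim)} transactions")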
You can discover available partitions programmatically:

from huggingface_hub import HfFileSystem
# List all available transaction partitions
fs = HfFileSystem()
files = fs.ls("datasets/solarchive/solarchive/txs", detail=False)
# Extract the partition names (the last path component, e.g. "2025-11-01")
tx_partitions = sorted(f.rstrip("/").split("/")[-1] for f in files)
print(f"Available transaction partitions: {len(tx_partitions)}")
print(f"First 5: {tx_partitions[:5]}")
print(f"Last 5: {tx_partitions[-5:]}")
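Discovery pairs naturally with loading. For example, building on the tx_partitions list above, you can always load the most recent day (a small sketch):

# Load the most recent transaction partition discovered above
latest = tx_partitions[-1]
latest_ds = load_dataset(
    "solarchive/solarchive",
    data_dir=f"txs/{latest}",
    split="train"
)
print(f"Latest partition {latest}: {len(latest_ds)} transactions")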
For very large partitions, use streaming to avoid loading everything into memory:

# Stream instead of downloading all at once
ds = load_dataset(
"solarchive/solarchive",
data_dir="txs/2025-11-01",
split="train",
streaming=True
)
# Process in batches
for i, batch in enumerate(ds.iter(batch_size=1000)):
    print(f"Processing batch {i}: {len(batch['signature'])} transactions")
    # Your processing logic here
    if i >= 10:  # Process only first 10 batches
        break
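Streaming also works for simple aggregations that never need the full partition in memory. As a sketch, here's a running fee total computed batch by batch, using the fee column (in lamports) shown in the DuckDB example above:

# A fresh streaming pass over the same partition
ds = load_dataset(
    "solarchive/solarchive",
    data_dir="txs/2025-11-01",
    split="train",
    streaming=True
)
total_fee_lamports = 0
tx_count = 0
for batch in ds.iter(batch_size=1000):
    total_fee_lamports += sum(batch["fee"])
    tx_count += len(batch["fee"])
print(f"Transactions: {tx_count}")
print(f"Average fee: {total_fee_lamports / tx_count / 1e9:.6f} SOL")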
Here's a complete example analyzing new token launches in a specific month:

import duckdb
from datasets import load_dataset
# Load token data
tokens = load_dataset(
"solarchive/solarchive",
data_dir="tokens/2021-04",
split="train"
)
# Query with DuckDB
con = duckdb.connect()
rel = con.from_arrow(tokens.data.table)
# Find unique tokens with names
result = rel.query(
    "tokens_rel",
    """
    SELECT DISTINCT ON (mint)
        mint,
        name,
        symbol,
        block_timestamp,
        is_nft
    FROM tokens_rel
    WHERE name IS NOT NULL AND name != ''
    ORDER BY mint, block_timestamp ASC
    """
).to_df()
print(f"Unique tokens in April 2021: {len(result)}")
print(f"NFTs: {result['is_nft'].sum()}")
print(f"Fungible tokens: {(~result['is_nft']).sum()}")
# Most common symbols
print("\nTop 10 symbols:")
print(result['symbol'].value_counts().head(10))
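If you want to keep the results around, the summary DataFrame can be written out with standard pandas I/O, for example to a Parquet file (the filename here is just an example; pyarrow is already installed as a dependency of datasets):

# Save the token summary for later use (example filename)
result.to_parquet("tokens_2021_04.parquet", index=False)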