A Coding Hands-On on FineWeb for Streaming, Filtering, Deduplication, Tokenization, and Large-Scale Web Corpus Analytics

A Coding Hands-On on FineWeb for Streaming, Filtering, Deduplication, Tokenization, and Large-Scale Web Corpus Analytics


# ---------------------------------------------------------------------------
# Corpus analytics: domain frequencies, a 2x2 histogram dashboard, and a
# plain-text summary of the streamed sample.
#
# Relies on names defined earlier in the file: `df` (with the columns
# "url", "token_count", "language_score", "chars_per_token"), `dup_pairs`,
# `results`, `pd`, `plt`, and `urlparse`.
# NOTE(review): the exact construction of those objects is outside this
# section -- confirm the column names against the code that builds `df`.
# ---------------------------------------------------------------------------


def _domain_of(url):
    """Return the host of *url* without a leading ``www.``.

    Non-string values (e.g. missing URLs) map to the placeholder ``"?"``.
    """
    if not isinstance(url, str):
        return "?"
    host = urlparse(url).netloc
    # Strip only a *leading* "www." -- a blanket str.replace would also remove
    # the substring from hosts such as "awww.example.com".
    return host[4:] if host.startswith("www.") else host


# --- Domain frequency table -------------------------------------------------
df["domain"] = df["url"].apply(_domain_of)
top_domains = df["domain"].value_counts().head(15)
print("\n— Top 15 domains in sample —")
print(top_domains)

# --- 2x2 dashboard ----------------------------------------------------------
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Top-left: token counts; values above 4000 are clipped so the long tail
# collapses into the last bin instead of stretching the x-axis.
axes[0, 0].hist(df["token_count"].clip(upper=4000), bins=50, color="#7b2d26")
axes[0, 0].set_title("Token count per document (gpt2)")
axes[0, 0].set_xlabel("tokens")
axes[0, 0].set_ylabel("docs")

# Top-right: language score with a dashed marker at the 0.65 cutoff.
axes[0, 1].hist(df["language_score"], bins=40, color="#2d5d7b")
axes[0, 1].axvline(0.65, color="red", ls="--", label="FineWeb cutoff 0.65")
axes[0, 1].set_title("fastText English language score")
axes[0, 1].set_xlabel("score")
axes[0, 1].legend()

# Bottom-left: characters per token, clipped at 8 for the same reason as above.
axes[1, 0].hist(df["chars_per_token"].clip(upper=8), bins=40, color="#3f7b2d")
axes[1, 0].set_title("Characters per token (compression)")
axes[1, 0].set_xlabel("chars / token")

# Bottom-right: top domains; the series is reversed so the most frequent
# domain ends up at the top of the horizontal bar chart.
top_domains.iloc[::-1].plot(kind="barh", ax=axes[1, 1], color="#7b5d2d")
axes[1, 1].set_title("Top domains")

plt.tight_layout()
plt.show()

# --- Text summary -----------------------------------------------------------
print("\n" + "=" * 70)
print("SUMMARY")
print("=" * 70)
print(f"Docs streamed : {len(df):,}")
print(f"Total gpt2 tokens : {df['token_count'].sum():,}")
print(f"Median tokens/doc : {int(df['token_count'].median())}")
print(f"Unique domains : {df['domain'].nunique():,}")
print(f"Mean language_score : {df['language_score'].mean():.3f}")
print(f"Near-duplicate pairs : {len(dup_pairs)}")
# Every entry of `results` that is not the literal 'kept' counts as flagged.
print(f"Docs flagged by filters : {(pd.Series(results) != 'kept').sum()} / {len(results)}")
print("\nNext steps:")
# Single-quoted literal so the embedded name="..." examples keep their quotes.
print(' • Swap name="sample-10BT" for a real crawl, e.g. name="CC-MAIN-2024-10"')
print(" • Raise N_DOCS for stronger statistics")
print(" • Use the full datatrove pipeline to reproduce FineWeb end-to-end")



Source link

Leave a Reply

Your email address will not be published. Required fields are marked *

Pin It on Pinterest