Dataset Viewer
Auto-converted to Parquet Duplicate
code
stringlengths
72
8.78M
code_en
stringlengths
72
8.78M
language
stringclasses
1 value
file_path
stringlengths
36
164
license
stringclasses
1 value
token_count
int64
26
8.41M
# -*- coding: utf-8 -*- from collections import OrderedDict from ipywidgets import Widget, Tab class ExtendedTab(Tab): """ A Tab subclass that allows to add/access/select/replace/remove children by name. There can be only one tab for any given name. Example: import time t = Extend...
# -*- coding: utf-8 -*- from collections import OrderedDict from ipywidgets import Widget, Tab class ExtendedTab(Tab): """ A Tab subclass that allows to add/access/select/replace/remove children by name. There can be only one tab for any given name. Example: import time t = Extend...
en
002440303_deeplook-ipyrest_extendedtab_093e2be98d4c.py
unknown
717
# ------------------------------------------------------------------------------------------------ # Deformable DETR # Copyright (c) 2020 SenseTime. All Rights Reserved. # Licensed under the Apache License, Version 2.0 [see LICENSE for details] # -------------------------------------------------------------------------...
# ------------------------------------------------------------------------------------------------ # Deformable DETR # Copyright (c) 2020 SenseTime. All Rights Reserved. # Licensed under the Apache License, Version 2.0 [see LICENSE for details] # -------------------------------------------------------------------------...
en
005599174_eslambakr-HRS_benchmark_ms_deform_attn_e958c034cd2c.py
unknown
2,162
""" Notice : 神兽保佑 ,测试一次通过 // // ┏┛ ┻━━━━━┛ ┻┓ // ┃       ┃ // ┃   ━   ┃ // ┃ ┳┛  ┗┳ ┃ // ┃       ┃ // ┃   ┻   ┃ // ┃       ┃ // ┗━┓   ┏━━━┛ // ┃   ┃ Author: somewheve // ┃   ┃ Datetime: 2019/7/3 下午8:46 ---> 无知即是罪恶 // ┃   ┗━━━━━━━━━┓ // ┃   ...
""" Notice : 神兽保佑 ,测试一次通过 // // ┏┛ ┻━━━━━┛ ┻┓ // ┃       ┃ // ┃   ━   ┃ // ┃ ┳┛  ┗┳ ┃ // ┃       ┃ // ┃   ┻   ┃ // ┃       ┃ // ┗━┓   ┏━━━┛ // ┃   ┃ Author: somewheve // ┃   ┃ Datetime: 2019/7/3 下午8:46 ---> 无知即是罪恶 // ┃   ┗━━━━━━━━━┓ // ┃   ...
en
004828635_ctpbee-ctpbee_local_position_cba89669d079.py
unknown
6,624
""" Reddit comments --------------- A collection of up to ~1.5 billion Reddit comments posted from October 2007 through May 2015. Records include the following key fields (plus a few others): - ``body``: Full text of the comment. - ``created_utc``: Date on which the comment was posted. - ``subreddit``: S...
""" Reddit comments --------------- A collection of up to ~1.5 billion Reddit comments posted from October 2007 through May 2015. Records include the following key fields (plus a few others): - ``body``: Full text of the comment. - ``created_utc``: Date on which the comment was posted. - ``subreddit``: S...
en
005639457_chartbeat-labs-textacy_reddit_comments_dd41d70f3bec.py
unknown
4,005
# Copyright (c) Meta Platforms, Inc. and affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. from django.http import HttpRequest def source() -> str: request = HttpRequest() return request.GET["bad"] def sink(argument: st...
# Copyright (c) Meta Platforms, Inc. and affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. from django.http import HttpRequest def source() -> str: request = HttpRequest() return request.GET["bad"] def sink(argument: st...
en
005513877_facebook-pyre-check_taint_9fd251ea2de2.py
unknown
98
"""Spatial Dissimilarity Index.""" __author__ = "Renan X. Cortes <renanc@ucr.edu>, Sergio J. Rey <sergio.rey@ucr.edu> and Elijah Knaap <elijah.knaap@ucr.edu>" import libpysal import numpy as np from libpysal.weights import Queen from .._base import SingleGroupIndex, SpatialExplicitIndex from .dissim import _dissim ...
"""Spatial Dissimilarity Index.""" __author__ = "Renan X. Cortes <renanc@ucr.edu>, Sergio J. Rey <sergio.rey@ucr.edu> and Elijah Knaap <elijah.knaap@ucr.edu>" import libpysal import numpy as np from libpysal.weights import Queen from .._base import SingleGroupIndex, SpatialExplicitIndex from .dissim import _dissim ...
en
005596770_pysal-segregation_spatial_dissim_c7cbba1b3cc7.py
unknown
1,408
# Copyright 2022 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to...
# Copyright 2022 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to...
en
003296220_mindspore-ai-models_pointTransfomrer_4aa2cdd24484.py
unknown
3,593
""" Conduct searches against all registry context. """ import math from operator import itemgetter from flask import abort from stringscore import liquidmetal from text_unidecode import unidecode import features from app import app, authentication, avatar from auth import scopes from auth.auth_context import get_aut...
""" Conduct searches against all registry context. """ import math from operator import itemgetter from flask import abort from stringscore import liquidmetal from text_unidecode import unidecode import features from app import app, authentication, avatar from auth import scopes from auth.auth_context import get_aut...
en
002995759_quay-quay_search_1eeecbd30b46.py
unknown
4,126
from pygments.lexer import RegexLexer, words, include from pygments.token import * from pygments.style import Style from pygments.token import Keyword, Name, Comment, String, Error, Number, Operator, Generic, Text from sphinxcontrib.domaintools import custom_domain import re __all__ = ['MinilangLexer'] class Minilang...
from pygments.lexer import RegexLexer, words, include from pygments.token import * from pygments.style import Style from pygments.token import Keyword, Name, Comment, String, Error, Number, Operator, Generic, Text from sphinxcontrib.domaintools import custom_domain import re __all__ = ['MinilangLexer'] class Minilang...
en
003598869_wrapl-minilang_minilang_5aa1f4fdd5b5.py
unknown
1,326
import os import numpy as np import yaml import re from edflow.data.dataset_mixin import DatasetMixin from edflow.util import retrieve, get_obj_from_str, pp2mkdtable, pop_keypath from edflow.util import walk, set_value, edprint from edflow.data.believers.meta_loaders import DEFAULT_LOADERS try: from IPython impor...
import os import numpy as np import yaml import re from edflow.data.dataset_mixin import DatasetMixin from edflow.util import retrieve, get_obj_from_str, pp2mkdtable, pop_keypath from edflow.util import walk, set_value, edprint from edflow.data.believers.meta_loaders import DEFAULT_LOADERS try: from IPython impor...
en
000899950_pesser-edflow_meta_09fbed6837d7.py
unknown
2,808
import logging import os from restic_compose_backup import utils logger = logging.getLogger(__name__) def run(image: str = None, command: str = None, volumes: dict = None, environment: dict = None, labels: dict = None, source_container_id: str = None): logger.info("Starting backup container") client...
import logging import os from restic_compose_backup import utils logger = logging.getLogger(__name__) def run(image: str = None, command: str = None, volumes: dict = None, environment: dict = None, labels: dict = None, source_container_id: str = None): logger.info("Starting backup container") client...
en
005414885_ZettaIO-restic-compose-backup_backup_runner_9c6cd421a308.py
unknown
517
End of preview. Expand in Data Studio

Language Decoded | Multilingual Code Dataset

Experiment and proposed paper title: Language Decoded: Exploring the Impact of Native Code on Multilingual Models

Note (2026-05-18): Current Phase 3 configs use the short condition-* namespace and include 103k, 20k, and 5k sizes for Conditions 1--2. Phase 2 configs remain available under the phase-2-the-stack-v1-* namespace for reproducibility.

Multilingual Python code datasets for the Language Decoded project (part of Cohere's Tiny Aya Expedition). See legesher/language-decoded-experiments for the canonical project description, the full experimental ladder, and the paper-grade evaluation results.

Research Question

How does fine-tuning on non-English code — whether transpiled, mixed-native, or fully translated — affect a model's multilingual reasoning, and how does that impact differ from fine-tuning on English code?

Prior work (Aryabumi et al., 2024 -- "To Code or Not to Code") demonstrated that including English code in pre-training data improves downstream reasoning performance by approximately 8%. However, that study only tested English code. This dataset enables the natural follow-up: how does the impact of non-English code differ from English code, and how does that vary by language, structure, and corpus construction?

Dataset Description

This dataset provides filtered, quality-controlled Python source code in multiple configurations: the original English (cond-1); three Legesher-transpiled variants (cond-2 zh/es/ur, with Python's reserved words translated to the target language); a community-collected raw native-source corpus (cond-3); strictly native code (cond-4, pending); and a model-translated set (cond-5, where c4ai-aya-expanse-32b translates everything translatable inside the file). Python source for Conditions 1, 2, and 5 is drawn from bigcode/the-stack-v2-dedup (Python subset) for the current Phase 3 configs; the legacy phase-2-the-stack-v1-* configs are sourced from The Stack v1 (non-dedup). Conditions 3 and 4 draw on natively-authored or community-contributed code (see those conditions below).

Source-file control

Cond-1, cond-2, and cond-5 all train on the same 5,000-file subset drawn from bigcode/the-stack-v2-dedup (with a parallel 20k subset for the 20k tier). Differences across these conditions reflect the processing pipeline (raw / transpiled / fully translated), not file-quality or content drift. Cond-3 is the deliberate exception — its source files are a different population by design (community-collected from varied online sources, potentially including non-Python files).

Source files for cond-1/2/5 are filtered using:

  • AST-valid Python only (must parse without errors)
  • Permissive licenses only (MIT, Apache-2.0, BSD, etc.)
  • 10--1000 lines of code
  • Minimum 21 GitHub stars
  • No autogenerated files
  • SHA-256 deduplication

Cond-2 variants are produced using Legesher v0.7.3, which translates Python's reserved words (37 keywords, 72 built-in functions, 66 exceptions, plus the numerical system for some target languages) into the target language while preserving code structure and user logic. Cond-5 takes the Legesher-transpiled output and runs it through c4ai-aya-expanse-32b via the Cohere API to translate the remaining content — identifiers, comments, docstrings, string literals, and any other natural-language wording — into the target language. Logic and structure are preserved throughout.

Available Configs

Conditions 1--2 are available in three current Phase 3 sizes: -103k full corpora, -20k random subsets sampled from the corresponding -103k config with seed 42, and -5k compact subsets. Phase 2 -32k configs are still available with the phase-2-the-stack-v1-* prefix. Condition 5 (condition-5-*-c4ai-aya-expanse-32b) is the model-translated set — currently 5k only, and raw/pre-cleanup (see "Condition 5 is raw model output" under Limitations).

Config Condition Language Description Train Val
condition-1-en-103k 1 (control) English Unmodified filtered Python 93,549 10,395
condition-1-en-20k 1 (control) English Random 20k subset of condition-1-en-103k 18,000 2,000
condition-1-en-5k 1 (control) English Compact 5k subset 4,500 500
condition-2-zh-103k 2 Chinese Legesher v0.7.3 reserved-word translation 93,547 10,395
condition-2-zh-20k 2 Chinese Random 20k subset of condition-2-zh-103k 18,000 2,000
condition-2-zh-5k 2 Chinese Compact 5k subset 4,500 500
condition-2-es-103k 2 Spanish Legesher v0.7.3 reserved-word translation 93,547 10,395
condition-2-es-20k 2 Spanish Random 20k subset of condition-2-es-103k 18,000 2,000
condition-2-es-5k 2 Spanish Compact 5k subset 4,500 500
condition-2-ur-103k 2 Urdu Legesher v0.7.3 reserved-word translation 93,547 10,395
condition-2-ur-20k 2 Urdu Random 20k subset of condition-2-ur-103k 18,000 2,000
condition-2-ur-5k 2 Urdu Compact 5k subset 4,500 500
condition-3-zh-5k 3 Chinese Blended: native Chinese code + transpiled Python 4,500 500
condition-4-zh-5k 4 Chinese Strictly native Chinese code 6,553 729
condition-5-ur-5k-c4ai-aya-expanse-32b 5 Urdu Model-translated (full LLM translation via Cohere Aya) — raw, pre-cleanup 4,088 381
condition-5-zh-5k-c4ai-aya-expanse-32b 5 Chinese Model-translated (full LLM translation via Cohere Aya) — raw, pre-cleanup 4,052 381
condition-5-es-5k-c4ai-aya-expanse-32b 5 Spanish Model-translated (full LLM translation via Cohere Aya) — raw, pre-cleanup 4,032 329

Schema

Conditions 1--2

Used by: condition-1-en-*, condition-2-zh-*, condition-2-es-*, condition-2-ur-*

Column Type Description
code string Python source code. For condition-2 configs, this is the Legesher-transpiled (reserved-word translated) version. For condition-1, this is the original English source.
code_en string Original English Python source code. Identical to code for condition-1-en.
language string ISO 639-1 language code: en, ur, zh, or es.
file_path string Original file path in the source dataset.
license string SPDX license identifier for the source file.
token_count int64 Token count computed using the CohereLabs/tiny-aya-base tokenizer.

Condition 5

Used by: condition-5-ur-5k-c4ai-aya-expanse-32b, condition-5-zh-5k-c4ai-aya-expanse-32b, condition-5-es-5k-c4ai-aya-expanse-32b

Condition 5 uses the conditions 1--2 schema plus an idx column. code is the full LLM-translated source (identifiers, strings, comments, and keywords); code_en is the English original. These configs are raw model output — see "Condition 5 is raw model output" under Limitations.

Column Type Description
code string Model-translated Python source (full LLM translation via Cohere Aya).
code_en string Original English Python source code.
language string ISO 639-1 language code: ur, zh, or es.
file_path string Original file path in the source dataset.
license string SPDX license identifier for the source file.
idx int64 Source row index into condition-1-en-5k. Enables row-level joins across conditions.
token_count int64 Token count computed using the CohereLabs/tiny-aya-base tokenizer.

Condition 3

Used by: condition-3-zh-5k

In Phase 3, Condition 3 ("Mixed Native Sources") refers to community-collected raw Chinese code from varied online public-source repositories — reflecting how non-English Python is actually used in real-world projects. The "Mixed Native Sources" name carries from Phase 2, where it originally referred to a planned composite (native code padded with cond-2 transpiled files); in Phase 3 the "mixed" refers to the diversity of source locations, not a cond-2/native composite. The physical dataset has not changed across phases.

The schema includes a source_type column from the Phase 2 composite design, which remains "native" or "transpiled" depending on each row's origin. code_en is populated for transpiled rows (keeping them in sync with conditions 1--2) but null for native code rows, which have no English equivalent.

Column Type Description
file_path string File identifier (native filename or transpiled file path)
code string The code content (native or transpiled)
code_en string/null English original -- populated for transpiled rows, null for native code rows
language string ISO 639-1 language code (zh)
license string Source license (SPDX identifier, UNKNOWN, or varies)
token_count int64 Token count computed using the CohereLabs/tiny-aya-base tokenizer
source_type string "native" (natively Chinese-authored) or "transpiled" (Legesher reserved-word translation of English)

Condition 4

Used by: condition-4-zh-5k

Condition 4 ("Community-Contributed Native Code") is intended to contain code whose problem-solving logic is itself native — written as if a native speaker were approaching the problem, not English code that was later translated. The current dataset reflects an earlier Phase 2 attempt to assemble this corpus; community contributions were insufficient for stable training, so cond-4 was not evaluated in either Phase 2 or Phase 3. Cond-5's fully-translated data served as Phase 3's practical proxy because gathering native-authored code at scale proved difficult. Direct contributions to the cond-4 corpus are open at the legesher/legesher-native-code HF Space.

Condition 4 uses the same schema as the language-decoded-community dataset rather than the transpilation schema, since there is no English original to reference.

Column Type Description
filename string Original filename
content string The code content
extension string File extension (e.g., .py, .c, .wenyan)
source string Data source (e.g., thestack, wenyan, program_in_chinese)
quality_tier string Quality rating: A (highest) through D (lowest)
sha256 string SHA-256 hash for deduplication
byte_size int64 File size in bytes
total_lines int64 Total line count
cjk_ratio float64 Ratio of CJK characters in the file
has_cjk bool Whether the file contains CJK characters

Experimental Conditions

The Language Decoded experiment uses a ladder of conditions to isolate the mechanism behind code's reasoning benefit. For the full ladder including future directions, see legesher/language-decoded-experiments.

Condition Name Purpose
Baseline No fine-tuning Establishes the performance floor
Condition 1 English code Tests whether code fine-tuning helps at all (replicates Aryabumi et al.)
Condition 2 Reserved-Word Translation (Legesher) Tests whether translating Python's reserved words (keywords, exceptions, built-in functions, numerical system for some target languages) into the target language matters
Condition 3 Mixed Native Sources Tests whether code pulled from real-world public-source repositories (humans actually writing in the target language) adds value beyond Legesher's mechanical translation
Condition 4 Community-Contributed Native Code Tests whether code whose problem-solving logic is itself native (not translated from English) carries unique signal — pending sufficient community contributions
Condition 5 Synthesized Native Code Tests whether full translation (Legesher reserved words + c4ai-aya-expanse-32b full file translation) changes the picture relative to Condition 2's partial translation

The Experimental Ladder

  • Baseline → 1: Does code help at all?
  • 1 → 2: Does the language Python is written in matter? (Cond-2 translates Python's reserved words; user logic preserved.)
  • 2 → 3: Does code humans actually wrote in or with the target language add value beyond Legesher's mechanical translation?
  • 2 → 5: Cond-2 translates only Python's reserved words; cond-5 goes further by also translating identifiers, comments, docstrings, and string literals via c4ai-aya-expanse-32b. Logic preserved. Does full translation produce different effects than partial translation?
  • 3 → 5 (implicit): Human-authored vs. machine-synthesized native code.

Usage

from datasets import load_dataset

# NOTE: every call below rebinds `ds`, so only the most recently loaded config
# stays in scope. Copy just the line(s) for the config you actually need.

# Load full-size English code (control)
ds = load_dataset("legesher/language-decoded-data", "condition-1-en-103k")

# Load random 20k subsets
ds = load_dataset("legesher/language-decoded-data", "condition-1-en-20k")
ds = load_dataset("legesher/language-decoded-data", "condition-2-zh-20k")
ds = load_dataset("legesher/language-decoded-data", "condition-2-es-20k")
ds = load_dataset("legesher/language-decoded-data", "condition-2-ur-20k")

# Load 5k subset (for QLoRA fine-tuning)
ds = load_dataset("legesher/language-decoded-data", "condition-1-en-5k")

# Load Legesher-transpiled variants (reserved-word translation)
ds = load_dataset("legesher/language-decoded-data", "condition-2-zh-5k")
ds = load_dataset("legesher/language-decoded-data", "condition-2-es-5k")
ds = load_dataset("legesher/language-decoded-data", "condition-2-ur-5k")

# Load blended native + transpiled (condition 3)
ds = load_dataset("legesher/language-decoded-data", "condition-3-zh-5k")

# Load strictly native code (condition 4)
ds = load_dataset("legesher/language-decoded-data", "condition-4-zh-5k")

# Load model-translated code (condition 5 -- raw, pre-cleanup)
ds = load_dataset("legesher/language-decoded-data", "condition-5-ur-5k-c4ai-aya-expanse-32b")
ds = load_dataset("legesher/language-decoded-data", "condition-5-zh-5k-c4ai-aya-expanse-32b")
ds = load_dataset("legesher/language-decoded-data", "condition-5-es-5k-c4ai-aya-expanse-32b")

# Access splits of whichever config `ds` currently holds
train = ds["train"]
val = ds["validation"]

# Filter condition-3 by source type.
# `source_type` exists only in the condition-3 schema, so load that config
# explicitly rather than reusing `train`, which at this point comes from a
# condition-5 config and has no such column.
cond3 = load_dataset("legesher/language-decoded-data", "condition-3-zh-5k")
native_only = cond3["train"].filter(lambda x: x["source_type"] == "native")

Technical Details

Parameter Value
Source dataset (Phase 3, condition-*) bigcode/the-stack-v2-dedup (Python subset)
Source dataset (Phase 2, phase-2-the-stack-v1-*) bigcode/the-stack (The Stack v1)
Transpilation tool Legesher v0.7.3 (legesher-core, legesher-i18n)
Tokenizer CohereLabs/tiny-aya-base
Base model CohereLabs/tiny-aya-base (3.35B params)
Condition 5 translation model Cohere c4ai-aya-expanse-32b (Aya Expanse 32B, via the Cohere API)
Train/validation split 90% / 10% (seed 42)
File format Parquet (snappy compression)
Filtering criteria AST-valid, permissive licenses, 10--1000 lines, min 21 GitHub stars, no autogenerated files, SHA-256 deduplication

Limitations

  • Source bias: The Stack skews toward popular, well-starred GitHub repositories, which may not represent the full diversity of Python code in the wild.
  • Keyword-only transpilation: Legesher translates Python reserved words (keywords, builtins, exceptions) but leaves comments, docstrings, string literals, and variable/function names in their original language (typically English). This means condition-2 code is a hybrid of translated keywords and English identifiers.
  • Token count variation: Transpiled code may have different token counts than the English original due to multi-byte characters (especially for Chinese and Urdu), even though the code structure is identical.
  • Single programming language: Currently limited to Python. Results may not generalize to other programming languages.
  • Condition 4 not yet evaluated: Community contributions to the legesher/legesher-native-code HF Space have been insufficient for stable training. The existing condition-4-zh-5k data is a Phase 2 attempt limited to publicly available sources (The Stack, Wenyan, Program-in-Chinese, Qi, Mulan). Cond-5's fully-translated data served as the Phase 3 practical proxy for cond-4's "logic in the target language" goal.
  • Condition 5 is raw model output: The condition-5-* configs contain prompt-leakage contamination -- translator-model preamble text, JSON wrappers, and explanation commentary leaked into string literals and identifier names, in AST-valid and AST-invalid rows alike. Cleaned configs will be published separately. These configs are marked "raw, pre-cleanup" in the Available Configs table.

Citation

@misc{language-decoded-2026,
  title={Language Decoded: Exploring the Impact of Native Code on Multilingual Models},
  author={Madison Edgar and Saad Ahmed Bazaz and Tom Sherborne and Rashik Shahjahan and Khojasteh Mirza and Sarah Jawaid and Rafay Mustafa and Sohaib Ahmed Bazaz},
  year={2026},
  publisher={Hugging Face},
  url={https://huggingface.co/datasets/legesher/language-decoded-data}
}

Links

License

Apache 2.0

Downloads last month
3,282

Collection including legesher/language-decoded-data

Paper for legesher/language-decoded-data