Datasets:

legesher
/

language-decoded-data

code stringlengths 72 8.78M	code_en stringlengths 72 8.78M	language stringclasses 1 value	file_path stringlengths 36 164	license stringclasses 1 value	token_count int64 26 8.41M
# -- coding: utf-8 -- from collections import OrderedDict from ipywidgets import Widget, Tab class ExtendedTab(Tab): """ A Tab subclass that allows to add/access/select/replace/remove children by name. There can be only one tab for any given name. Example: import time t = Extend...	# -- coding: utf-8 -- from collections import OrderedDict from ipywidgets import Widget, Tab class ExtendedTab(Tab): """ A Tab subclass that allows to add/access/select/replace/remove children by name. There can be only one tab for any given name. Example: import time t = Extend...	en	002440303_deeplook-ipyrest_extendedtab_093e2be98d4c.py	unknown	717
# ------------------------------------------------------------------------------------------------ # Deformable DETR # Copyright (c) 2020 SenseTime. All Rights Reserved. # Licensed under the Apache License, Version 2.0 [see LICENSE for details] # -------------------------------------------------------------------------...	# ------------------------------------------------------------------------------------------------ # Deformable DETR # Copyright (c) 2020 SenseTime. All Rights Reserved. # Licensed under the Apache License, Version 2.0 [see LICENSE for details] # -------------------------------------------------------------------------...	en	005599174_eslambakr-HRS_benchmark_ms_deform_attn_e958c034cd2c.py	unknown	2,162
""" Notice : 神兽保佑，测试一次通过 // // ┏┛ ┻━━━━━┛ ┻┓ // ┃　　　　　　 ┃ // ┃　　　━　　　┃ // ┃　┳┛　 ┗┳　┃ // ┃　　　　　　 ┃ // ┃　　　┻　　　┃ // ┃　　　　　　 ┃ // ┗━┓　　　┏━━━┛ // ┃　　　┃ Author: somewheve // ┃　　　┃ Datetime: 2019/7/3 下午8:46 ---> 无知即是罪恶 // ┃　　　┗━━━━━━━━━┓ // ┃　　　...	""" Notice : 神兽保佑，测试一次通过 // // ┏┛ ┻━━━━━┛ ┻┓ // ┃　　　　　　 ┃ // ┃　　　━　　　┃ // ┃　┳┛　 ┗┳　┃ // ┃　　　　　　 ┃ // ┃　　　┻　　　┃ // ┃　　　　　　 ┃ // ┗━┓　　　┏━━━┛ // ┃　　　┃ Author: somewheve // ┃　　　┃ Datetime: 2019/7/3 下午8:46 ---> 无知即是罪恶 // ┃　　　┗━━━━━━━━━┓ // ┃　　　...	en	004828635_ctpbee-ctpbee_local_position_cba89669d079.py	unknown	6,624
""" Reddit comments --------------- A collection of up to ~1.5 billion Reddit comments posted from October 2007 through May 2015. Records include the following key fields (plus a few others): - ``body``: Full text of the comment. - ``created_utc``: Date on which the comment was posted. - ``subreddit``: S...	""" Reddit comments --------------- A collection of up to ~1.5 billion Reddit comments posted from October 2007 through May 2015. Records include the following key fields (plus a few others): - ``body``: Full text of the comment. - ``created_utc``: Date on which the comment was posted. - ``subreddit``: S...	en	005639457_chartbeat-labs-textacy_reddit_comments_dd41d70f3bec.py	unknown	4,005
# Copyright (c) Meta Platforms, Inc. and affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. from django.http import HttpRequest def source() -> str: request = HttpRequest() return request.GET["bad"] def sink(argument: st...	# Copyright (c) Meta Platforms, Inc. and affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. from django.http import HttpRequest def source() -> str: request = HttpRequest() return request.GET["bad"] def sink(argument: st...	en	005513877_facebook-pyre-check_taint_9fd251ea2de2.py	unknown	98
"""Spatial Dissimilarity Index.""" __author__ = "Renan X. Cortes <renanc@ucr.edu>, Sergio J. Rey <sergio.rey@ucr.edu> and Elijah Knaap <elijah.knaap@ucr.edu>" import libpysal import numpy as np from libpysal.weights import Queen from .._base import SingleGroupIndex, SpatialExplicitIndex from .dissim import _dissim ...	"""Spatial Dissimilarity Index.""" __author__ = "Renan X. Cortes <renanc@ucr.edu>, Sergio J. Rey <sergio.rey@ucr.edu> and Elijah Knaap <elijah.knaap@ucr.edu>" import libpysal import numpy as np from libpysal.weights import Queen from .._base import SingleGroupIndex, SpatialExplicitIndex from .dissim import _dissim ...	en	005596770_pysal-segregation_spatial_dissim_c7cbba1b3cc7.py	unknown	1,408
# Copyright 2022 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to...	# Copyright 2022 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to...	en	003296220_mindspore-ai-models_pointTransfomrer_4aa2cdd24484.py	unknown	3,593
""" Conduct searches against all registry context. """ import math from operator import itemgetter from flask import abort from stringscore import liquidmetal from text_unidecode import unidecode import features from app import app, authentication, avatar from auth import scopes from auth.auth_context import get_aut...	""" Conduct searches against all registry context. """ import math from operator import itemgetter from flask import abort from stringscore import liquidmetal from text_unidecode import unidecode import features from app import app, authentication, avatar from auth import scopes from auth.auth_context import get_aut...	en	002995759_quay-quay_search_1eeecbd30b46.py	unknown	4,126
from pygments.lexer import RegexLexer, words, include from pygments.token import * from pygments.style import Style from pygments.token import Keyword, Name, Comment, String, Error, Number, Operator, Generic, Text from sphinxcontrib.domaintools import custom_domain import re __all__ = ['MinilangLexer'] class Minilang...	from pygments.lexer import RegexLexer, words, include from pygments.token import * from pygments.style import Style from pygments.token import Keyword, Name, Comment, String, Error, Number, Operator, Generic, Text from sphinxcontrib.domaintools import custom_domain import re __all__ = ['MinilangLexer'] class Minilang...	en	003598869_wrapl-minilang_minilang_5aa1f4fdd5b5.py	unknown	1,326
import os import numpy as np import yaml import re from edflow.data.dataset_mixin import DatasetMixin from edflow.util import retrieve, get_obj_from_str, pp2mkdtable, pop_keypath from edflow.util import walk, set_value, edprint from edflow.data.believers.meta_loaders import DEFAULT_LOADERS try: from IPython impor...	import os import numpy as np import yaml import re from edflow.data.dataset_mixin import DatasetMixin from edflow.util import retrieve, get_obj_from_str, pp2mkdtable, pop_keypath from edflow.util import walk, set_value, edprint from edflow.data.believers.meta_loaders import DEFAULT_LOADERS try: from IPython impor...	en	000899950_pesser-edflow_meta_09fbed6837d7.py	unknown	2,808
import logging import os from restic_compose_backup import utils logger = logging.getLogger(__name__) def run(image: str = None, command: str = None, volumes: dict = None, environment: dict = None, labels: dict = None, source_container_id: str = None): logger.info("Starting backup container") client...	import logging import os from restic_compose_backup import utils logger = logging.getLogger(__name__) def run(image: str = None, command: str = None, volumes: dict = None, environment: dict = None, labels: dict = None, source_container_id: str = None): logger.info("Starting backup container") client...	en	005414885_ZettaIO-restic-compose-backup_backup_runner_9c6cd421a308.py	unknown	517

End of preview. Expand in Data Studio

Language Decoded | Multilingual Code Dataset

Experiment and proposed paper title: Language Decoded: Exploring the Impact of Native Code on Multilingual Models

Note (2026-05-18): Current Phase 3 configs use the short condition-* namespace and include 103k, 20k, and 5k sizes for Conditions 1--2. Phase 2 configs remain available under the phase-2-the-stack-v1-* namespace for reproducibility.

Multilingual Python code datasets for the Language Decoded project (part of Cohere's Tiny Aya Expedition). See legesher/language-decoded-experiments for the canonical project description, the full experimental ladder, and the paper-grade evaluation results.

Research Question

How does fine-tuning on non-English code — whether transpiled, mixed-native, or fully translated — affect a model's multilingual reasoning, and how does that impact differ from fine-tuning on English code?

Prior work (Aryabumi et al., 2024 -- "To Code or Not to Code") demonstrated that including English code in pre-training data improves downstream reasoning performance by approximately 8%. However, that study only tested English code. This dataset enables the natural follow-up: how does the impact of non-English code differ from English code, and how does that vary by language, structure, and corpus construction?

Dataset Description

This dataset provides filtered, quality-controlled Python source code in multiple configurations: the original English (cond-1); three Legesher-transpiled variants (cond-2 zh/es/ur, with Python's reserved words translated to the target language); a community-collected raw native-source corpus (cond-3); strictly native code (cond-4, pending); and a model-translated set (cond-5, where c4ai-aya-expanse-32b translates everything translatable inside the file). Python source for Conditions 1, 2, and 5 is drawn from bigcode/the-stack-v2-dedup (Python subset) for the current Phase 3 configs; the legacy phase-2-the-stack-v1-* configs are sourced from The Stack v1 (non-dedup). Conditions 3 and 4 draw on natively-authored or community-contributed code (see those conditions below).

Source-file control

Cond-1, cond-2, and cond-5 all train on the same 5,000-file subset drawn from bigcode/the-stack-v2-dedup (with a parallel 20k subset for the 20k tier). Differences across these conditions reflect the processing pipeline (raw / transpiled / fully translated), not file-quality or content drift. Cond-3 is the deliberate exception — its source files are a different population by design (community-collected from varied online sources, potentially including non-Python files).

Source files for cond-1/2/5 are filtered using:

AST-valid Python only (must parse without errors)
Permissive licenses only (MIT, Apache-2.0, BSD, etc.)
10--1000 lines of code
Minimum 21 GitHub stars
No autogenerated files
SHA-256 deduplication

Cond-2 variants are produced using Legesher v0.7.3, which translates Python's reserved words (37 keywords, 72 built-in functions, 66 exceptions, plus the numerical system for some target languages) into the target language while preserving code structure and user logic. Cond-5 takes the Legesher-transpiled output and runs it through c4ai-aya-expanse-32b via the Cohere API to translate the remaining content — identifiers, comments, docstrings, string literals, and any other natural-language wording — into the target language. Logic and structure are preserved throughout.

Available Configs

Conditions 1--2 are available in three current Phase 3 sizes: -103k full corpora, -20k random subsets sampled from the corresponding -103k config with seed 42, and -5k compact subsets. Phase 2 -32k configs are still available with the phase-2-the-stack-v1-* prefix. Condition 5 (condition-5-*-c4ai-aya-expanse-32b) is the model-translated set — currently 5k only, and raw/pre-cleanup (see "Condition 5 is raw model output" under Limitations).

Config	Condition	Language	Description	Train	Val
`condition-1-en-103k`	1 (control)	English	Unmodified filtered Python	93,549	10,395
`condition-1-en-20k`	1 (control)	English	Random 20k subset of `condition-1-en-103k`	18,000	2,000
`condition-1-en-5k`	1 (control)	English	Compact 5k subset	4,500	500
`condition-2-zh-103k`	2	Chinese	Legesher v0.7.3 reserved-word translation	93,547	10,395
`condition-2-zh-20k`	2	Chinese	Random 20k subset of `condition-2-zh-103k`	18,000	2,000
`condition-2-zh-5k`	2	Chinese	Compact 5k subset	4,500	500
`condition-2-es-103k`	2	Spanish	Legesher v0.7.3 reserved-word translation	93,547	10,395
`condition-2-es-20k`	2	Spanish	Random 20k subset of `condition-2-es-103k`	18,000	2,000
`condition-2-es-5k`	2	Spanish	Compact 5k subset	4,500	500
`condition-2-ur-103k`	2	Urdu	Legesher v0.7.3 reserved-word translation	93,547	10,395
`condition-2-ur-20k`	2	Urdu	Random 20k subset of `condition-2-ur-103k`	18,000	2,000
`condition-2-ur-5k`	2	Urdu	Compact 5k subset	4,500	500
`condition-3-zh-5k`	3	Chinese	Blended: native Chinese code + transpiled Python	4,500	500
`condition-4-zh-5k`	4	Chinese	Strictly native Chinese code	6,553	729
`condition-5-ur-5k-c4ai-aya-expanse-32b`	5	Urdu	Model-translated (full LLM translation via Cohere Aya) — raw, pre-cleanup	4,088	381
`condition-5-zh-5k-c4ai-aya-expanse-32b`	5	Chinese	Model-translated (full LLM translation via Cohere Aya) — raw, pre-cleanup	4,052	381
`condition-5-es-5k-c4ai-aya-expanse-32b`	5	Spanish	Model-translated (full LLM translation via Cohere Aya) — raw, pre-cleanup	4,032	329

Schema

Conditions 1--2

Used by: condition-1-en-*, condition-2-zh-*, condition-2-es-*, condition-2-ur-*

Column	Type	Description
`code`	string	Python source code. For condition-2 configs, this is the Legesher-transpiled (reserved-word translated) version. For condition-1, this is the original English source.
`code_en`	string	Original English Python source code. Identical to `code` for condition-1-en.
`language`	string	ISO 639-1 language code: `en`, `ur`, `zh`, or `es`.
`file_path`	string	Original file path in the source dataset.
`license`	string	SPDX license identifier for the source file.
`token_count`	int64	Token count computed using the CohereLabs/tiny-aya-base tokenizer.

Condition 5

Used by: condition-5-ur-5k-c4ai-aya-expanse-32b, condition-5-zh-5k-c4ai-aya-expanse-32b, condition-5-es-5k-c4ai-aya-expanse-32b

Condition 5 uses the conditions 1--2 schema plus an idx column. code is the full LLM-translated source (identifiers, strings, comments, and keywords); code_en is the English original. These configs are raw model output — see "Condition 5 is raw model output" under Limitations.

Column	Type	Description
`code`	string	Model-translated Python source (full LLM translation via Cohere Aya).
`code_en`	string	Original English Python source code.
`language`	string	ISO 639-1 language code: `ur`, `zh`, or `es`.
`file_path`	string	Original file path in the source dataset.
`license`	string	SPDX license identifier for the source file.
`idx`	int64	Source row index into `condition-1-en-5k`. Enables row-level joins across conditions.
`token_count`	int64	Token count computed using the CohereLabs/tiny-aya-base tokenizer.

Condition 3

Used by: condition-3-zh-5k

In Phase 3, Condition 3 ("Mixed Native Sources") refers to community-collected raw Chinese code from varied online public-source repositories — reflecting how non-English Python is actually used in real-world projects. The "Mixed Native Sources" name carries from Phase 2, where it originally referred to a planned composite (native code padded with cond-2 transpiled files); in Phase 3 the "mixed" refers to the diversity of source locations, not a cond-2/native composite. The physical dataset has not changed across phases.

The schema includes a source_type column from the Phase 2 composite design, which remains "native" or "transpiled" depending on each row's origin. code_en is populated for transpiled rows (keeping them in sync with conditions 1--2) but null for native code rows, which have no English equivalent.

Column	Type	Description
`file_path`	string	File identifier (native filename or transpiled file path)
`code`	string	The code content (native or transpiled)
`code_en`	string/null	English original -- populated for transpiled rows, null for native code rows
`language`	string	ISO 639-1 language code (`zh`)
`license`	string	Source license (SPDX identifier, `UNKNOWN`, or `varies`)
`token_count`	int64	Token count computed using the CohereLabs/tiny-aya-base tokenizer
`source_type`	string	`"native"` (natively Chinese-authored) or `"transpiled"` (Legesher reserved-word translation of English)

Condition 4

Used by: condition-4-zh-5k

Condition 4 ("Community-Contributed Native Code") is intended to contain code whose problem-solving logic is itself native — written as if a native speaker were approaching the problem, not English code that was later translated. The current dataset reflects an earlier Phase 2 attempt to assemble this corpus; community contributions were insufficient for stable training, so cond-4 was not evaluated in either Phase 2 or Phase 3. Cond-5's fully-translated data served as Phase 3's practical proxy because gathering native-authored code at scale proved difficult. Direct contributions to the cond-4 corpus are open at the legesher/legesher-native-code HF Space.

Uses the same schema as the language-decoded-community dataset rather than the transpilation schema, since there is no English original to reference.

Column	Type	Description
`filename`	string	Original filename
`content`	string	The code content
`extension`	string	File extension (e.g., `.py`, `.c`, `.wenyan`)
`source`	string	Data source (e.g., `thestack`, `wenyan`, `program_in_chinese`)
`quality_tier`	string	Quality rating: `A` (highest) through `D` (lowest)
`sha256`	string	SHA-256 hash for deduplication
`byte_size`	int64	File size in bytes
`total_lines`	int64	Total line count
`cjk_ratio`	float64	Ratio of CJK characters in the file
`has_cjk`	bool	Whether the file contains CJK characters

Experimental Conditions

The Language Decoded experiment uses a ladder of conditions to isolate the mechanism behind code's reasoning benefit. For the full ladder including future directions, see legesher/language-decoded-experiments.

Condition	Name	Purpose
Baseline	No fine-tuning	Establishes the performance floor
Condition 1	English code	Tests whether code fine-tuning helps at all (replicates Aryabumi et al.)
Condition 2	Reserved-Word Translation (Legesher)	Tests whether translating Python's reserved words (keywords, exceptions, built-in functions, numerical system for some target languages) into the target language matters
Condition 3	Mixed Native Sources	Tests whether code pulled from real-world public-source repositories (humans actually writing in the target language) adds value beyond Legesher's mechanical translation
Condition 4	Community-Contributed Native Code	Tests whether code whose problem-solving logic is itself native (not translated from English) carries unique signal — pending sufficient community contributions
Condition 5	Synthesized Native Code	Tests whether full translation (Legesher reserved words + `c4ai-aya-expanse-32b` full file translation) changes the picture relative to Condition 2's partial translation

The Experimental Ladder

Baseline → 1: Does code help at all?
1 → 2: Does the language Python is written in matter? (Cond-2 translates Python's reserved words; user logic preserved.)
2 → 3: Does code humans actually wrote in or with the target language add value beyond Legesher's mechanical translation?
2 → 5: Cond-2 translates only Python's reserved words; cond-5 goes further by also translating identifiers, comments, docstrings, and string literals via c4ai-aya-expanse-32b. Logic preserved. Does full translation produce different effects than partial translation?
3 → 5 (implicit): Human-authored vs. machine-synthesized native code.

Usage

from datasets import load_dataset

# NOTE: every call below rebinds `ds`. The calls are alternatives -- in
# practice, load only the config(s) you actually need.

# Load full-size English code (control)
ds = load_dataset("legesher/language-decoded-data", "condition-1-en-103k")

# Load random 20k subsets
ds = load_dataset("legesher/language-decoded-data", "condition-1-en-20k")
ds = load_dataset("legesher/language-decoded-data", "condition-2-zh-20k")
ds = load_dataset("legesher/language-decoded-data", "condition-2-es-20k")
ds = load_dataset("legesher/language-decoded-data", "condition-2-ur-20k")

# Load 5k subset (for QLoRA fine-tuning)
ds = load_dataset("legesher/language-decoded-data", "condition-1-en-5k")

# Load Legesher-transpiled variants (reserved-word translation)
ds = load_dataset("legesher/language-decoded-data", "condition-2-zh-5k")
ds = load_dataset("legesher/language-decoded-data", "condition-2-es-5k")
ds = load_dataset("legesher/language-decoded-data", "condition-2-ur-5k")

# Load blended native + transpiled (condition 3)
ds = load_dataset("legesher/language-decoded-data", "condition-3-zh-5k")

# Load strictly native code (condition 4)
ds = load_dataset("legesher/language-decoded-data", "condition-4-zh-5k")

# Load model-translated code (condition 5 -- raw, pre-cleanup)
ds = load_dataset("legesher/language-decoded-data", "condition-5-ur-5k-c4ai-aya-expanse-32b")
ds = load_dataset("legesher/language-decoded-data", "condition-5-zh-5k-c4ai-aya-expanse-32b")
ds = load_dataset("legesher/language-decoded-data", "condition-5-es-5k-c4ai-aya-expanse-32b")

# Access splits of whichever config `ds` currently holds
train = ds["train"]
val = ds["validation"]

# Filter condition-3 by source type.
# `source_type` is only part of the condition-3 schema, so load that config
# explicitly here instead of reusing `ds`/`train`, which at this point refer
# to the last config loaded above (condition 5) and have no such column.
cond3 = load_dataset("legesher/language-decoded-data", "condition-3-zh-5k")
native_only = cond3["train"].filter(lambda x: x["source_type"] == "native")

Technical Details

Parameter	Value
Source dataset (Phase 3, `condition-*`)	bigcode/the-stack-v2-dedup (Python subset)
Source dataset (Phase 2, `phase-2-the-stack-v1-*`)	bigcode/the-stack (The Stack v1)
Transpilation tool	Legesher v0.7.3 (legesher-core, legesher-i18n)
Tokenizer	CohereLabs/tiny-aya-base
Base model	CohereLabs/tiny-aya-base (3.35B params)
Condition 5 translation model	Cohere `c4ai-aya-expanse-32b` (Aya Expanse 32B, via the Cohere API)
Train/validation split	90% / 10% (seed 42)
File format	Parquet (snappy compression)
Filtering criteria	AST-valid, permissive licenses, 10--1000 lines, min 21 GitHub stars, no autogenerated files, SHA-256 deduplication

Limitations

Source bias: The Stack skews toward popular, well-starred GitHub repositories, which may not represent the full diversity of Python code in the wild.
Keyword-only transpilation: Legesher translates Python reserved words (keywords, builtins, exceptions) but leaves comments, docstrings, string literals, and variable/function names in their original language (typically English). This means condition-2 code is a hybrid of translated keywords and English identifiers.
Token count variation: Transpiled code may have different token counts than the English original due to multi-byte characters (especially for Chinese and Urdu), even though the code structure is identical.
Single programming language: Currently limited to Python. Results may not generalize to other programming languages.
Condition 4 not yet evaluated: Community contributions to the legesher/legesher-native-code HF Space have been insufficient for stable training. The existing condition-4-zh-5k data is a Phase 2 attempt limited to publicly available sources (The Stack, Wenyan, Program-in-Chinese, Qi, Mulan). Cond-5's fully-translated data served as the Phase 3 practical proxy for cond-4's "logic in the target language" goal.
Condition 5 is raw model output: The condition-5-* configs contain prompt-leakage contamination -- translator-model preamble text, JSON wrappers, and explanation commentary leaked into string literals and identifier names, in AST-valid and AST-invalid rows alike. Cleaned configs will be published separately. See the note at the top of this card.

Citation

@misc{language-decoded-2026,
  title={Language Decoded: Exploring the Impact of Native Code on Multilingual Models},
  author={Madison Edgar and Saad Ahmed Bazaz and Tom Sherborne and Rashik Shahjahan and Khojasteh Mirza and Sarah Jawaid and Rafay Mustafa and Sohaib Ahmed Bazaz},
  year={2026},
  publisher={Hugging Face},
  url={https://huggingface.co/datasets/legesher/language-decoded-data}
}