In [21]:
## 1. 数据下载。 通过huggingface dataset name直接在代码中下载。下载完的原始数据集保存路径为："/workspace/huggingface/datasets"。

In [22]:
from datasets import load_dataset

# Names of the COIG-CQIA configurations that are downloaded below.
subsets = [
    'chinese_traditional', 'coig_pc', 'exam', 'finance', 'douban',
    'human_value', 'logi_qa', 'ruozhiba', 'segmentfault', 'wiki',
    'wikihow', 'xhs', 'zhihu',
]

# COIG-CQIA is stored per subset (keyed by subset name) so the share of each
# subset can be controlled later on.
coig_cqia_ds = {}
for subset_name in subsets:
    print(f"Start with subset: {subset_name}")
    subset_splits = load_dataset("m-a-p/COIG-CQIA", subset_name)
    # Only the 'train' split of each subset is kept.
    coig_cqia_ds[subset_name] = subset_splits['train']
    print(f"*** Finish on subset: {subset_name}")

# Download the OL-CC dataset.
olcc_ds = load_dataset("lorinma/BAAI_OL-CC")

# Download the OpenHermesPreferences dataset.
openhermes_ds = load_dataset("argilla/OpenHermesPreferences")


Start with subset: chinese_traditional
*** Finish on subset: chinese_traditional
Start with subset: coig_pc
*** Finish on subset: coig_pc
Start with subset: exam
*** Finish on subset: exam
Start with subset: finance
*** Finish on subset: finance
Start with subset: douban
*** Finish on subset: douban
Start with subset: human_value
*** Finish on subset: human_value
Start with subset: logi_qa
*** Finish on subset: logi_qa
Start with subset: ruozhiba
*** Finish on subset: ruozhiba
Start with subset: segmentfault
*** Finish on subset: segmentfault
Start with subset: wiki
*** Finish on subset: wiki
Start with subset: wikihow
*** Finish on subset: wikihow
Start with subset: xhs
*** Finish on subset: xhs
Start with subset: zhihu
*** Finish on subset: zhihu


In [23]:
## 2. 统一数据集格式。这三个数据集的格式各不相同，且不完全符合LLaMA-Factory的格式要求，为了能够一起用于训练，需要先统一它们的格式。

In [24]:
## 2.1 COIG-CQIA数据集。统一格式后的数据集保存路径“/workspace/coig_cqia_unified_list.json”。

In [25]:
import json

# Flat list of samples collected from every COIG-CQIA subset.
coig_cqia_unified_list = []

for subset_name, subset in coig_cqia_ds.items():
    print(f"subset_name = {subset_name}, subset = {len(subset)}")
    # At most 1000 samples are taken from each subset.
    max_num = min(len(subset), 1000)

    for position, record in enumerate(subset):
        if position == max_num:
            break
        # "id" is the running index within the unified list; "subset" records
        # which subset the sample came from.
        coig_cqia_unified_list.append({
            "id": len(coig_cqia_unified_list),
            "instruction": record['instruction'],
            "input": record['input'],
            "output": record['output'],
            "subset": subset_name
        })

# Persist the unified list as pretty-printed UTF-8 JSON.
serialized = json.dumps(coig_cqia_unified_list, ensure_ascii=False, indent=2)
with open("/workspace/coig_cqia_unified_list.json", "w", encoding="utf-8") as f:
    f.write(serialized)

subset_name = chinese_traditional, subset = 1111
subset_name = coig_pc, subset = 3000
subset_name = exam, subset = 4856
subset_name = finance, subset = 11288
subset_name = douban, subset = 3086
subset_name = human_value, subset = 1007
subset_name = logi_qa, subset = 421
subset_name = ruozhiba, subset = 240
subset_name = segmentfault, subset = 458
subset_name = wiki, subset = 10603
subset_name = wikihow, subset = 1485
subset_name = xhs, subset = 1508
subset_name = zhihu, subset = 5631


In [26]:
## 2.2 OL-CC数据集。统一格式后的数据集保存路径“/workspace/olcc_list.json”。
## 此数据集中有些样本仅有单条query文本、没有回答文本，不符合我们的训练要求，因此在此步将其过滤掉。

In [27]:
import json

# Samples converted to the instruction / input / output layout.
olcc_list = []

for item in olcc_ds['train']:
    conversation = item["conversations"]
    # The first turn is used as the instruction and the second as the output.
    # Samples that lack either turn (e.g. a query without any reply) are
    # printed and skipped. Only the errors such a malformed conversation can
    # raise are caught, so unrelated failures and KeyboardInterrupt still
    # propagate instead of being silently swallowed.
    try:
        query_text = conversation[0]["value"]
        reply_text = conversation[1]["value"]
    except (IndexError, KeyError, TypeError):
        print(f"item = {item}")
        continue

    olcc_list.append({
        "id": len(olcc_list),
        "instruction": query_text,
        "input": "",
        "output": reply_text,
    })

# Persist the converted samples as pretty-printed UTF-8 JSON.
with open("/workspace/olcc_list.json", "w", encoding="utf-8") as f:
    json.dump(olcc_list, f, ensure_ascii=False, indent=2)

item = {'conversations': [{'from': 'human', 'value': '怎么给朋友科普女性主义呢？'}]}
item = {'conversations': [{'from': 'human', 'value': '请问2020年世界上发生哪些大事？'}]}
item = {'conversations': [{'from': 'human', 'value': '你认为和人相处最重要的品质是什么'}]}
item = {'conversations': [{'from': 'human', 'value': '请给我生成一个智能对话的专利申请书'}]}
item = {'conversations': [{'from': 'human', 'value': '我是一名中国人，我居住在黑龙江省毗邻俄罗斯边境的地方，今天我出去捕猎时遇到一头来自俄罗斯的黑熊攻击我，我严重受伤了，但仍然打死了它。请问我能不能找俄罗斯的政府赔偿？如果可以，我要采取什么措施？'}]}
item = {'conversations': [{'from': 'human', 'value': '最近我身边一个女生总是让我女朋友非常生气，她说“哥哥，我们以后还是不要再联系了，不然姐姐该生气了”请帮我分析她是不是女朋友所说的“绿茶”'}]}
item = {'conversations': [{'from': 'human', 'value': '我正在天安门前方站着，想要去人民大会堂，要如何走？'}]}
item = {'conversations': [{'from': 'human', 'value': '我在天安门广场，想要去人民大会堂，如何走？'}]}
item = {'conversations': [{'from': 'human', 'value': '帮我写一个小明不在家的小说'}]}
item = {'conversations': [{'from': 'human', 'value': '请问北京为什么这么多雾霾天气？'}]}
item = {'conversations': [{'from': 'human', 'value': '给我写一篇大模型的新闻稿'}]}
item = {'conversations': [{'from': 'hu

In [28]:
## 2.3 OpenHermesPreferences数据集。 
## 此数据集样本较多，初步保留5000条，统一格式后的数据集保存路径“/workspace/openhermes_list.json”。

In [40]:
import json

# Samples converted to the instruction / input / output layout.
openhermes_list = []

for position, record in enumerate(openhermes_ds['train']):
    # Only the first 5000 samples of the train split are kept.
    if position == 5000:
        break

    # The first entry of "ranks" is used as an index into
    # "candidates_completions".
    # NOTE(review): presumably this selects the top-ranked completion —
    # confirm against the dataset card.
    select_rank = record["ranks"][0]
    chosen_completion = record["candidates_completions"][select_rank]
    openhermes_list.append({
        "id": len(openhermes_list),
        "instruction": record["prompt"],
        "input": "",
        "output": chosen_completion,
    })

# Persist the converted samples as pretty-printed UTF-8 JSON.
serialized = json.dumps(openhermes_list, ensure_ascii=False, indent=2)
with open("/workspace/openhermes_list.json", "w", encoding="utf-8") as f:
    f.write(serialized)

In [30]:
##3. 计算每个数据样本的困惑度。和论文一样，我们选择Qwen模型计算困惑度（Qwen2.5-7B）。资源配置：GPU实例1卡。

In [31]:
##导入需要的python包。

In [32]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import math
import json
from tqdm import tqdm

In [33]:
##加载模型。

In [35]:
model_path = "/shared-only/models/Qwen/Qwen2.5-7B" 
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", torch_dtype="auto")


Loading checkpoint shards: 100%|██████████| 4/4 [00:34<00:00,  8.66s/it]


In [36]:
##准备计算perplexity的函数、读取数据集的函数和在每条数据样本中写入perplexity数值的函数。

In [37]:
def calculate_perplexity(text: str, max_length=2048):
    """Return the perplexity of ``text`` under the module-level ``model``.

    The text is tokenized with the module-level ``tokenizer`` (truncated to at
    most ``max_length`` tokens) and passed to ``model`` with the token ids
    doubling as labels. The perplexity is ``exp`` of the loss the model
    reports.

    Args:
        text: The text to score.
        max_length: Maximum number of tokens kept after truncation.

    Returns:
        The perplexity as a float. ``nan`` is returned when the tokenized text
        has fewer than two tokens: a causal-LM loss needs at least one
        next-token target, so the perplexity is undefined for such input and
        the model is not called at all.
    """
    encodings = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_length)
    input_ids = encodings.input_ids.to(model.device)

    # Empty or single-token input: nothing to predict, so report "undefined"
    # instead of sending a degenerate batch through the model.
    if input_ids.shape[-1] < 2:
        return float("nan")

    # Scoring only; no gradients are needed.
    with torch.no_grad():
        outputs = model(input_ids, labels=input_ids)
    neg_log_likelihood = outputs.loss.item()

    # The former trailing ``input_ids.to("cpu")`` was dropped: ``Tensor.to``
    # returns a new tensor and its result was discarded, so it had no effect.
    return math.exp(neg_log_likelihood)

def read_json(json_pth):
    """Load and return the JSON document stored at ``json_pth``.

    The file is decoded as UTF-8 explicitly instead of with the platform's
    default encoding, because the JSON files in this notebook are written with
    ``ensure_ascii=False`` and therefore contain raw non-ASCII text.

    Args:
        json_pth: Path of the JSON file to read.

    Returns:
        The deserialized Python object.
    """
    with open(json_pth, "r", encoding="utf-8") as fp:
        return json.load(fp)

def write_json(dict_list, json_pth, ensure_ascii=False, print_mes=True):
    """Serialize ``dict_list`` to ``json_pth`` as indented UTF-8 JSON.

    Args:
        dict_list: JSON-serializable object to store.
        json_pth: Destination file path. Missing parent directories are
            created.
        ensure_ascii: Forwarded to ``json.dumps``; with the default ``False``
            non-ASCII characters are written as-is rather than escaped.
        print_mes: When true, print a confirmation line after writing.
    """
    import os  # local import: this module is needed only by this helper

    # Serialize before opening the file so that a serialization error does not
    # leave an already-truncated file behind.
    json_obj = json.dumps(dict_list, ensure_ascii=ensure_ascii, indent=4)

    parent_dir = os.path.dirname(json_pth)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Explicit UTF-8: with ensure_ascii=False the payload may contain raw
    # non-ASCII text, which must not depend on the platform default encoding.
    with open(json_pth, "w", encoding="utf-8") as json_ff:
        json_ff.write(json_obj)
    if print_mes:
        print(f"*** Written JSON file to: {json_pth}")

In [38]:
##3.1 计算coig_cqia数据集的perplexity值并写入数据集中。保存路径为"/workspace/Chinesedata"。

In [39]:
dataset_name = "coig_cqia"

# NOTE: the output file names spell "caia"; they are kept exactly as-is
# because the later cells of this notebook read these exact paths.
dataset_path = "/workspace/coig_cqia_unified_list.json"
save_path = "/workspace/Chinesedata/coig_caia_with_ppl.json"
ppl_path = "/workspace/Chinesedata/coig_caia_ppl_list.json"

datalist = read_json(dataset_path)
print(f"Loaded data from: {dataset_path}")

# Iterate over the list itself rather than enumerate(...): the index was
# unused, and a sized iterable lets tqdm show the total and a real progress
# bar instead of a bare iteration counter.
all_ppl = []
for item in tqdm(datalist, desc=dataset_name):
    # The scored text is instruction, input and output concatenated without
    # any separator.
    text = item["instruction"] + item['input'] + item['output']
    ppl = calculate_perplexity(text)
    # Annotate the sample in place and collect the bare value as well.
    item["perplexity"] = ppl
    all_ppl.append(ppl)

# Save the annotated samples and the flat list of perplexities separately.
write_json(datalist, save_path)
write_json(all_ppl, ppl_path)


Loaded data from: /workspace/coig_cqia_unified_list.json


11119it [04:38, 39.94it/s]


*** Written JSON file to: /workspace/Chinesedata/coig_caia_with_ppl.json
*** Written JSON file to: /workspace/Chinesedata/coig_caia_ppl_list.json


In [41]:
##3.2 计算olcc数据集的perplexity值并写入数据集中。保存路径为"/workspace/Chinesedata"。

In [42]:
dataset_name = "olcc"

# NOTE: the output file names contain a double underscore; they are kept
# exactly as-is because the later cells of this notebook read these paths.
dataset_path = "/workspace/olcc_list.json"
save_path = "/workspace/Chinesedata/olcc__with_ppl.json"
ppl_path = "/workspace/Chinesedata/olcc__ppl_list.json"

datalist = read_json(dataset_path)
print(f"Loaded data from: {dataset_path}")

# Iterate over the list itself rather than enumerate(...): the index was
# unused, and a sized iterable lets tqdm show the total and a real progress
# bar instead of a bare iteration counter.
all_ppl = []
for item in tqdm(datalist, desc=dataset_name):
    # The scored text is instruction, input and output concatenated without
    # any separator.
    text = item["instruction"] + item['input'] + item['output']
    ppl = calculate_perplexity(text)
    # Annotate the sample in place and collect the bare value as well.
    item["perplexity"] = ppl
    all_ppl.append(ppl)

# Save the annotated samples and the flat list of perplexities separately.
write_json(datalist, save_path)
write_json(all_ppl, ppl_path)


Loaded data from: /workspace/olcc_list.json


10006it [02:53, 57.68it/s]


*** Written JSON file to: /workspace/Chinesedata/olcc__with_ppl.json
*** Written JSON file to: /workspace/Chinesedata/olcc__ppl_list.json


In [43]:
##3.3 计算open_hermes数据集的perplexity值并写入数据集中。保存路径为"/workspace/Chinesedata"。

In [44]:
dataset_name = "open_hermes"

# NOTE: the output file names contain a double underscore; they are kept
# exactly as-is because the later cells of this notebook read these paths.
dataset_path = "/workspace/openhermes_list.json"
save_path = "/workspace/Chinesedata/openhermes_list__with_ppl.json"
ppl_path = "/workspace/Chinesedata/openhermes_list__ppl_list.json"

datalist = read_json(dataset_path)
print(f"Loaded data from: {dataset_path}")

# Iterate over the list itself rather than enumerate(...): the index was
# unused, and a sized iterable lets tqdm show the total and a real progress
# bar instead of a bare iteration counter.
all_ppl = []
for item in tqdm(datalist, desc=dataset_name):
    # The scored text is instruction, input and output concatenated without
    # any separator.
    text = item["instruction"] + item['input'] + item['output']
    ppl = calculate_perplexity(text)
    # Annotate the sample in place and collect the bare value as well.
    item["perplexity"] = ppl
    all_ppl.append(ppl)

# Save the annotated samples and the flat list of perplexities separately.
write_json(datalist, save_path)
write_json(all_ppl, ppl_path)

Loaded data from: /workspace/openhermes_list.json


5000it [01:47, 46.62it/s]


*** Written JSON file to: /workspace/Chinesedata/openhermes_list__with_ppl.json
*** Written JSON file to: /workspace/Chinesedata/openhermes_list__ppl_list.json


In [45]:
##4. 困惑度分布检查。用于找出每个数据集位于5%、10%、25%、50%、75%、90%、95%的值，便于后续过滤数据。

In [46]:
def get_ppl_percentiles(perplexities, p_splits=(5, 10, 25, 50, 75, 90, 95)):
    """Print mean/std of the perplexities and return selected percentiles.

    Non-finite values (``nan`` / ``inf``) are excluded from every statistic so
    that a single undefined perplexity cannot turn the whole report into
    ``nan``; the number of excluded values is printed when it is non-zero.

    Args:
        perplexities: Sequence of perplexity values.
        p_splits: Percentile ranks (0-100) to evaluate. Defaults to the ranks
            5, 10, 25, 50, 75, 90 and 95.

    Returns:
        A list with one ``{"percent": rank, "ppl": value}`` dict per entry of
        ``p_splits``, in the same order. Each ``ppl`` is a plain Python float.

    Raises:
        ValueError: If ``perplexities`` contains no finite value.
    """
    ps = np.asarray(perplexities, dtype=float)
    finite_ps = ps[np.isfinite(ps)]
    if finite_ps.size == 0:
        raise ValueError("perplexities must contain at least one finite value")

    mu, sigma = finite_ps.mean(), finite_ps.std()
    split_ppl = np.percentile(finite_ps, p_splits)

    print("Perplexity 分布统计：")
    print(f"  平均 (μ): {mu:.2f}, 标准差 (σ): {sigma:.2f}")
    dropped = ps.size - finite_ps.size
    if dropped:
        print(f"  Ignored {dropped} non-finite perplexity value(s)")

    # Convert to built-in floats so the result prints without numpy wrappers.
    return [
        {"percent": percent, "ppl": float(value)}
        for percent, value in zip(p_splits, split_ppl)
    ]

import json
import numpy as np

# Read each dataset's perplexity list in turn (COIG-CQIA, OL-CC, then
# OpenHermesPreferences), compute its percentile table and print it.
for ppl_file in (
    "/workspace/Chinesedata/coig_caia_ppl_list.json",
    "/workspace/Chinesedata/olcc__ppl_list.json",
    "/workspace/Chinesedata/openhermes_list__ppl_list.json",
):
    with open(ppl_file, "r", encoding="utf-8") as f:
        ppl_list = json.load(f)

    # get_ppl_percentiles prints the summary statistics itself; the returned
    # percentile table is printed here.
    percentiles = get_ppl_percentiles(ppl_list)
    print(percentiles)

Perplexity 分布统计：
  平均 (μ): 11.26, 标准差 (σ): 17.85
[{'percent': 5, 'ppl': np.float64(1.7609791583553558)}, {'percent': 10, 'ppl': np.float64(2.3774339122726116)}, {'percent': 25, 'ppl': np.float64(4.183022821102429)}, {'percent': 50, 'ppl': np.float64(8.103510117076127)}, {'percent': 75, 'ppl': np.float64(13.919111625943838)}, {'percent': 90, 'ppl': np.float64(22.34844480228618)}, {'percent': 95, 'ppl': np.float64(28.7236050581752)}]
Perplexity 分布统计：
  平均 (μ): 18.27, 标准差 (σ): 109.79
[{'percent': 5, 'ppl': np.float64(2.252530073273915)}, {'percent': 10, 'ppl': np.float64(2.761334751785927)}, {'percent': 25, 'ppl': np.float64(4.112294602172408)}, {'percent': 50, 'ppl': np.float64(7.216227490079606)}, {'percent': 75, 'ppl': np.float64(15.18877790568214)}, {'percent': 90, 'ppl': np.float64(31.884855333271773)}, {'percent': 95, 'ppl': np.float64(53.16710794682911)}]
Perplexity 分布统计：
  平均 (μ): 2.39, 标准差 (σ): 2.45
[{'percent': 5, 'ppl': np.float64(1.2475028189496349)}, {'percent': 10, 'ppl': np

In [47]:
##5. 根据困惑度过滤数据集，并拆分训练集&测试集。对每个数据集仅保留困惑度较低的前75%样本，其余样本过滤掉。

In [48]:
##5.1 过滤coig_cqia数据集，仅保留数据集困惑度较低的前75%数据样本，使用 train_test_split 方法，随机抽取 200 条样本作为测试集，剩余的作为训练集。

In [49]:
from datasets import Dataset

import os

filter_save_dir = "/workspace/Chinesedata/data_ppl_filtered"
# The split files below are written into this directory, so make sure it
# exists before anything is saved.
os.makedirs(filter_save_dir, exist_ok=True)

############################ dataset-specific settings ############################
dataset_name = "coig_cqia"
# Samples with a perplexity above this value are dropped. The percentile
# report for this dataset printed a 75th percentile of about 13.92.
ppl_threshold = 13.91

###################################################################################
# NOTE: the file names spell "caia"; the input path must match the file
# written by the perplexity cell, so the spelling is kept.
org_ppl_datapath = "/workspace/Chinesedata/coig_caia_with_ppl.json"
train_filtered_ppl_datapath = f"{filter_save_dir}/coig_caia_train_ppl_filtered.json"
test_filtered_ppl_datapath = f"{filter_save_dir}/coig_caia_test_ppl_filtered.json"

# Keep only the samples whose perplexity does not exceed the threshold.
org_datalist = read_json(org_ppl_datapath)
filtered_datalist = [
    item for item in org_datalist if item["perplexity"] <= ppl_threshold
]

# Split off 200 samples as the test set (fixed seed); the rest is the train set.
filtered_dataset = Dataset.from_list(filtered_datalist)
splits = filtered_dataset.train_test_split(test_size=200, seed=42)

# Each split is written as one JSON array of records, non-ASCII text unescaped.
splits["train"].to_pandas().to_json(train_filtered_ppl_datapath, orient="records", lines=False, force_ascii=False)
splits["test"].to_pandas().to_json(test_filtered_ppl_datapath, orient="records", lines=False, force_ascii=False)


In [50]:
##5.2 过滤olcc数据集，仅保留数据集困惑度较低的前75%数据样本，使用 train_test_split 方法，随机抽取 200 条样本作为测试集，剩余的作为训练集。

In [51]:
import os

filter_save_dir = "/workspace/Chinesedata/data_ppl_filtered"
# The split files below are written into this directory, so make sure it
# exists before anything is saved.
os.makedirs(filter_save_dir, exist_ok=True)

############################ dataset-specific settings ############################
dataset_name = "olcc"
# Samples with a perplexity above this value are dropped. The percentile
# report for this dataset printed a 75th percentile of about 15.19.
ppl_threshold = 15.19

###################################################################################
org_ppl_datapath = "/workspace/Chinesedata/olcc__with_ppl.json"
train_filtered_ppl_datapath = f"{filter_save_dir}/olcc_train_ppl_filtered.json"
test_filtered_ppl_datapath = f"{filter_save_dir}/olcc_test_ppl_filtered.json"

# Keep only the samples whose perplexity does not exceed the threshold.
org_datalist = read_json(org_ppl_datapath)
filtered_datalist = [
    item for item in org_datalist if item["perplexity"] <= ppl_threshold
]

# Split off 200 samples as the test set (fixed seed); the rest is the train set.
filtered_dataset = Dataset.from_list(filtered_datalist)
splits = filtered_dataset.train_test_split(test_size=200, seed=42)

# Each split is written as one JSON array of records, non-ASCII text unescaped.
splits["train"].to_pandas().to_json(train_filtered_ppl_datapath, orient="records", lines=False, force_ascii=False)
splits["test"].to_pandas().to_json(test_filtered_ppl_datapath, orient="records", lines=False, force_ascii=False)

In [52]:
##5.3 过滤open_hermes数据集，仅保留数据集困惑度较低的前75%数据样本，使用 train_test_split 方法，随机抽取 200 条样本作为测试集，剩余的作为训练集。

In [53]:
import os

filter_save_dir = "/workspace/Chinesedata/data_ppl_filtered"
# The split files below are written into this directory, so make sure it
# exists before anything is saved.
os.makedirs(filter_save_dir, exist_ok=True)

############################ dataset-specific settings ############################
dataset_name = "open_hermes"
# Samples with a perplexity above this value are dropped.
# NOTE(review): presumably the 75th percentile from the percentile report for
# this dataset — confirm against that report.
ppl_threshold = 2.34
###################################################################################
org_ppl_datapath = "/workspace/Chinesedata/openhermes_list__with_ppl.json"
train_filtered_ppl_datapath = f"{filter_save_dir}/open_hermes_train_ppl_filtered.json"
test_filtered_ppl_datapath = f"{filter_save_dir}/open_hermes_test_ppl_filtered.json"

# Keep only the samples whose perplexity does not exceed the threshold.
org_datalist = read_json(org_ppl_datapath)
filtered_datalist = [
    item for item in org_datalist if item["perplexity"] <= ppl_threshold
]

# Split off 200 samples as the test set (fixed seed); the rest is the train set.
filtered_dataset = Dataset.from_list(filtered_datalist)
splits = filtered_dataset.train_test_split(test_size=200, seed=42)

# Each split is written as one JSON array of records, non-ASCII text unescaped.
splits["train"].to_pandas().to_json(train_filtered_ppl_datapath, orient="records", lines=False, force_ascii=False)
splits["test"].to_pandas().to_json(test_filtered_ppl_datapath, orient="records", lines=False, force_ascii=False)