使用 ESMFold 预测 3D 结构并保存为 PDB 文件

预测单个序列

from transformers import AutoTokenizer, EsmForProteinFolding

# Load the pretrained "facebook/esmfold_v1" ESMFold model.
folding_model = EsmForProteinFolding.from_pretrained("facebook/esmfold_v1")

# Amino-acid sequence to fold, written in one-letter residue codes.
peptide = "MKTVRQERLKSIVRILERSKEPVSGAQLAEELSVSRQVIVQDIAYLRSLGYNIVATPRGYVLAGG"

# Run the prediction and store the resulting text as a PDB file in the
# current working directory.
pdb_text = folding_model.infer_pdb(peptide)
with open("my_peptide.pdb", "w") as pdb_file:
    pdb_file.write(pdb_text)

1
2
3
4
5
6
7

通过 .fasta 文件预测

from transformers import EsmForProteinFolding
import os
import torch
import argparse
from Bio import SeqIO

def load_model(use_gpu=True):
    """Load the pretrained ESMFold model and move it to the chosen device.

    Args:
        use_gpu: When truthy, CUDA is used if ``torch.cuda.is_available()``
            reports a device; otherwise the model stays on the CPU.

    Returns:
        A ``(model, device)`` tuple, where ``device`` is the string
        ``"cuda"`` or ``"cpu"``.
    """
    print("Loading ESMFold model...")

    # Decide once whether CUDA is both requested and actually present.
    gpu_ready = use_gpu and torch.cuda.is_available()
    device = "cuda" if gpu_ready else "cpu"
    if gpu_ready:
        print(f"Using GPU: {torch.cuda.get_device_name(0)}")
    else:
        print("Using CPU")

    folding_model = EsmForProteinFolding.from_pretrained(
        "facebook/esmfold_v1"
    ).to(device)

    print("Model loaded successfully!")
    return folding_model, device

# Characters that are path separators or are rejected/awkward in file names
# on common file systems and shells; each one is replaced with "_".
_UNSAFE_FILENAME_CHARS = str.maketrans({ch: "_" for ch in '|/\\:*?"<>'})


def predict_and_save(model, sequence_id, sequence, output_dir):
    """Predict the structure of one sequence and write it to a PDB file.

    The output file is ``<output_dir>/<sanitised sequence_id>.pdb``.

    Args:
        model: Object exposing ``infer_pdb(sequence)`` that returns the PDB
            text for the given sequence.
        sequence_id: Identifier of the sequence; used (after sanitising) as
            the output file name.
        sequence: Amino-acid sequence string to fold.
        output_dir: Existing directory in which the PDB file is created.

    Returns:
        ``True`` if the PDB file was written, ``False`` if the sequence was
        empty or any error occurred during prediction or writing.
    """
    if not sequence:
        # Nothing to fold; report it instead of handing it to the model.
        print(f"Error predicting {sequence_id}: empty sequence")
        return False

    try:
        print(f"Predicting structure for {sequence_id}...")
        output = model.infer_pdb(sequence)

        # Sanitise the sequence ID so it is usable as a single file name.
        # An ID that is empty would otherwise produce a hidden ".pdb" file.
        clean_id = sequence_id.translate(_UNSAFE_FILENAME_CHARS) or "unnamed"
        pdb_path = os.path.join(output_dir, f"{clean_id}.pdb")

        # Explicit encoding keeps the output independent of the locale.
        with open(pdb_path, "w", encoding="utf-8") as f:
            f.write(output)

        print(f"Saved: {pdb_path}")
        return True
    except Exception as e:
        # Deliberately broad: one failing sequence must not abort a batch
        # run; the failure is reported and signalled via the return value.
        print(f"Error predicting {sequence_id}: {str(e)}")
        return False

def main(
    fasta_file="/raid1/xz/Protein/esmfold_test/PLD_Streptomyces_twoHKD_D_only.fasta",
    output_dir="/raid1/xz/Protein/esmfold_test/output",
):
    """Predict a structure for every record of a FASTA file.

    Each record is folded with the model returned by ``load_model`` and
    saved through ``predict_and_save``; a summary is printed at the end.

    Args:
        fasta_file: Path of the FASTA file to read.
        output_dir: Directory that receives the PDB files; created if it
            does not exist.

    Raises:
        FileNotFoundError: If ``fasta_file`` does not exist. This is checked
            before the output directory is created and the model is loaded.
    """
    # Fail fast: do not load the model only to discover the input is missing.
    if not os.path.isfile(fasta_file):
        raise FileNotFoundError(f"FASTA file not found: {fasta_file}")

    # Create the output directory.
    os.makedirs(output_dir, exist_ok=True)

    # Load the model; the device name is not needed here.
    model, _ = load_model()

    # Counters for the final summary. total_sequences stays 0 when the
    # FASTA file contains no records.
    total_sequences = 0
    successful_predictions = 0

    # Read the FASTA file and predict the records one by one.
    print(f"Reading FASTA file: {fasta_file}")

    for total_sequences, record in enumerate(SeqIO.parse(fasta_file, "fasta"), start=1):
        sequence_id = record.id
        sequence = str(record.seq)

        print(f"\nProcessing sequence {total_sequences}: {sequence_id}")
        print(f"Sequence length: {len(sequence)}")

        if predict_and_save(model, sequence_id, sequence, output_dir):
            successful_predictions += 1

    print("\n=== Prediction Summary ===")
    print(f"Total sequences: {total_sequences}")
    print(f"Successful predictions: {successful_predictions}")
    print(f"Failed predictions: {total_sequences - successful_predictions}")
    print(f"Output directory: {output_dir}")


if __name__ == "__main__":
    main()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80

#合成生物学

上次更新: 2025/09/18, 20:06:50

← GraphEC | Rosetta介绍 →