Library Usage
This page walks through using molprint as a Rust library, from parsing a single SMILES string to running a parallel similarity search over thousands of compounds.
Parsing SMILES
#![allow(unused)] fn main() { use molprint_core::smiles::parse_smiles; let mol = parse_smiles("CC(=O)Oc1ccccc1C(=O)O").unwrap(); // aspirin println!("{} atoms, {} bonds", mol.node_count(), mol.edge_count()); // 13 atoms, 13 bonds }
parse_smiles returns a Result<MolGraph, ParseError>. In library code, propagate the error rather than unwrapping.
Inspecting atoms
#![allow(unused)] fn main() { use molprint_core::smiles::parse_smiles; use molprint_core::mol::graph::MolGraphExt; use petgraph::graph::NodeIndex; let mol = parse_smiles("CCO").unwrap(); // ethanol for i in 0..mol.node_count() { let idx = NodeIndex::new(i); let atom = mol.atom(idx); println!( "atom {}: {:?} charge={} h_count={} in_ring={}", i, atom.element, atom.charge, atom.h_count, atom.in_ring ); } }
Computing fingerprints
Morgan (ECFP)
#![allow(unused)] fn main() { use molprint_core::smiles::parse_smiles; use molprint_fp::{morgan::Morgan, traits::Fingerprinter}; let mol = parse_smiles("c1ccccc1").unwrap(); // ECFP4 (radius=2, 2048 bits) let morgan = Morgan::new(2, 2048); let fp = morgan.fingerprint(&mol); println!("{} bits set out of 2048", fp.count_ones()); println!("hex: {}", fp.to_hex()); }
Morgan::new(radius, nbits) where:
radius = 1→ ECFP2radius = 2→ ECFP4 (default)radius = 3→ ECFP6
MACCS-166
#![allow(unused)] fn main() { use molprint_core::smiles::parse_smiles; use molprint_fp::{maccs::Maccs166, traits::Fingerprinter}; let mol = parse_smiles("c1ccccc1").unwrap(); let maccs = Maccs166::new(); let fp = maccs.fingerprint(&mol); println!("{} bits set out of 166", fp.count_ones()); }
MACCS-166 always produces 167-bit fingerprints (bit 0 is always 0; bits 1–166 correspond to the 166 structural keys).
The Fingerprinter trait
Both Morgan and Maccs166 implement Fingerprinter:
#![allow(unused)] fn main() { pub trait Fingerprinter { fn fingerprint(&self, mol: &MolGraph) -> FingerprintBits; fn name(&self) -> &str; fn nbits(&self) -> usize; } }
You can use Box<dyn Fingerprinter> to select the algorithm at runtime.
Computing similarity
#![allow(unused)] fn main() { use molprint_core::smiles::parse_smiles; use molprint_fp::{morgan::Morgan, traits::Fingerprinter}; use molprint_search::metrics::{tanimoto, dice, cosine}; let benzene = parse_smiles("c1ccccc1").unwrap(); let pyridine = parse_smiles("c1ccncc1").unwrap(); let morgan = Morgan::new(2, 2048); let fp_a = morgan.fingerprint(&benzene); let fp_b = morgan.fingerprint(&pyridine); println!("Tanimoto: {:.3}", tanimoto(&fp_a, &fp_b)); println!("Dice: {:.3}", dice(&fp_a, &fp_b)); println!("Cosine: {:.3}", cosine(&fp_a, &fp_b)); }
All three metrics return f64 in [0.0, 1.0]. Tanimoto is the standard in cheminformatics.
Parallel screening
#![allow(unused)] fn main() { use molprint_core::smiles::parse_smiles; use molprint_fp::{morgan::Morgan, traits::Fingerprinter}; use molprint_search::screen::{threshold_search, top_k_search}; let morgan = Morgan::new(2, 2048); // Build a database (in practice, loaded from an FPS file) let smiles_list = vec!["CCO", "CCCO", "c1ccccc1", "CC(=O)O", "c1ccncc1"]; let db_fps: Vec<_> = smiles_list.iter() .map(|smi| morgan.fingerprint(&parse_smiles(smi).unwrap())) .collect(); // Query let query = parse_smiles("c1ccccc1").unwrap(); let query_fp = morgan.fingerprint(&query); // Threshold search: all compounds with Tanimoto >= 0.5 let hits = threshold_search(&query_fp, &db_fps, 0.5); for hit in &hits { println!("index={} similarity={:.3}", hit.index, hit.similarity); } // Top-k search: 3 most similar let top3 = top_k_search(&query_fp, &db_fps, 3); }
Both functions use Rayon for parallelism. They return Vec<SearchHit> sorted by descending similarity.
Reading and writing FPS files
#![allow(unused)] fn main() { use std::fs::File; use std::io::BufWriter; use molprint_io::fps::{FpsWriter, FpsReader}; // Write let out = File::create("output.fps").unwrap(); let mut writer = FpsWriter::new(BufWriter::new(out), 2048, "Morgan").unwrap(); writer.write_fingerprint("mol1", &fp).unwrap(); // Read let f = File::open("output.fps").unwrap(); let mut reader = FpsReader::new(f); let nbits = reader.read_header().unwrap(); for item in &mut reader { let (id, fp) = item.unwrap(); println!("{}: {} bits set", id, fp.count_ones()); } }
Reading SMILES files
#![allow(unused)] fn main() { use std::fs::File; use molprint_io::smiles_file::SmilesFileReader; let f = File::open("molecules.smi").unwrap(); for (id, mol) in SmilesFileReader::new(f) { println!("{}: {} atoms", id, mol.node_count()); } }
Reading SDF files
#![allow(unused)] fn main() { use molprint_io::sdf::open_sdf; for result in open_sdf("molecules.sdf").unwrap() { let record = result.unwrap(); println!("{}: {} atoms", record.name, record.mol.node_count()); // record.properties: HashMap<String, String> } }
open_sdf automatically handles .sdf.gz based on the file extension.
SMARTS substructure matching
#![allow(unused)] fn main() { use molprint_core::smarts; use molprint_core::smiles::parse_smiles; let carbonyl = smarts::compile("C=O").unwrap(); let aspirin = parse_smiles("CC(=O)Oc1ccccc1C(=O)O").unwrap(); if smarts::has_match(&aspirin, &carbonyl) { println!("aspirin contains a carbonyl group"); } // Count all matches (returns one mapping per unique anchor atom) let matches = smarts::subgraph_match(&aspirin, &carbonyl); println!("{} carbonyl groups found", matches.len()); }