
1662 lines
46 KiB
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

* Copyright 2017 Dgraph Labs, Inc. and Contributors
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
package badger
import (
// Bit flags that are OR-ed together in an entry's meta byte. bitDelete in particular helps us
// distinguish between a key that has never been seen and a key that has been explicitly deleted.
const (
	bitDelete                 byte = 1 << 0 // Set if the key has been deleted.
	bitValuePointer           byte = 1 << 1 // Set if the value is NOT stored directly next to key.
	bitDiscardEarlierVersions byte = 1 << 2 // Set if earlier versions can be discarded.
	// Set if item shouldn't be discarded via compactions (used by merge operator)
	bitMergeEntry byte = 1 << 3
	// The MSB 2 bits are for transactions.
	bitTxn    byte = 1 << 6 // Set if the entry is part of a txn.
	bitFinTxn byte = 1 << 7 // Set if the entry is to indicate end of txn in value log.

	// mi is 1 << 20 (1048576).
	mi int64 = 1 << 20

	// The number of updates after which discard map should be flushed into badger.
	discardStatsFlushThreshold = 100
)
type logFile struct {
path string
// This is a lock on the log file. It guards the fds value, the files
// existence and the files memory map.
// Use shared ownership when reading/writing the file or memory map, use
// exclusive ownership to open/close the descriptor, unmap or remove the file.
lock sync.RWMutex
fd *os.File
fid uint32
fmap []byte
size uint32
loadingMode options.FileLoadingMode
func (lf *logFile) mmap(size int64) (err error) {
if lf.loadingMode != options.MemoryMap {
// Nothing to do
return nil
lf.fmap, err = y.Mmap(lf.fd, false, size)
if err == nil {
err = y.Madvise(lf.fmap, false) // Disable readahead
return err
func (lf *logFile) munmap() (err error) {
if lf.loadingMode != options.MemoryMap || len(lf.fmap) == 0 {
// Nothing to do
return nil
if err := y.Munmap(lf.fmap); err != nil {
return errors.Wrapf(err, "Unable to munmap value log: %q", lf.path)
// This is important. We should set the map to nil because ummap
// system call doesn't change the length or capacity of the fmap slice.
lf.fmap = nil
return nil
// read returns the bytes addressed by value pointer p: p.Len bytes starting at p.Offset.
// In FileIO mode the bytes are read from lf.fd into s (resized to p.Len). Otherwise a
// sub-slice of lf.fmap is returned, or y.ErrEOF when the range falls outside the mapping
// or beyond the size recorded in lf.size.
// Acquire lock on mmap/file if you are calling this
func (lf *logFile) read(p valuePointer, s *y.Slice) (buf []byte, err error) {
// NOTE(review): nbr is assigned below but never read in the visible code; confirm whether
// a consumer of this byte count went missing.
var nbr int64
offset := p.Offset
if lf.loadingMode == options.FileIO {
buf = s.Resize(int(p.Len))
var n int
n, err = lf.fd.ReadAt(buf, int64(offset))
nbr = int64(n)
} else {
// Do not convert size to uint32, because the lf.fmap can be of size
// 4GB, which overflows the uint32 during conversion to make the size 0,
// causing the read to fail with ErrEOF. See issue #585.
size := int64(len(lf.fmap))
valsz := p.Len
lfsz := atomic.LoadUint32(&lf.size)
// NOTE(review): offset+valsz is summed as uint32 before the int64 conversion, so a
// pointer with a large offset and length could wrap and slip past these bounds checks.
if int64(offset) >= size || int64(offset+valsz) > size ||
// Ensure that the read is within the file's actual size. It might be possible that
// the offset+valsz length is beyond the file's actual size. This could happen when
// dropAll and iterations are running simultaneously.
int64(offset+valsz) > int64(lfsz) {
err = y.ErrEOF
} else {
buf = lf.fmap[offset : offset+valsz]
nbr = int64(valsz)
return buf, err
func (lf *logFile) doneWriting(offset uint32) error {
// Sync before acquiring lock. (We call this from write() and thus know we have shared access
// to the fd.)
if err := y.FileSync(lf.fd); err != nil {
return errors.Wrapf(err, "Unable to sync value log: %q", lf.path)
// Before we were acquiring a lock here on lf.lock, because we were invalidating the file
// descriptor due to reopening it as read-only. Now, we don't invalidate the fd, but unmap it,
// truncate it and remap it. That creates a window where we have segfaults because the mmap is
// no longer valid, while someone might be reading it. Therefore, we need a lock here again.
defer lf.lock.Unlock()
// Unmap file before we truncate it. Windows cannot truncate a file that is mmapped.
if err := lf.munmap(); err != nil {
return errors.Wrapf(err, "failed to munmap vlog file %s", lf.fd.Name())
// TODO: Confirm if we need to run a file sync after truncation.
// Truncation must run after unmapping, otherwise Windows would crap itself.
if err := lf.fd.Truncate(int64(offset)); err != nil {
return errors.Wrapf(err, "Unable to truncate file: %q", lf.path)
fstat, err := lf.fd.Stat()
if err != nil {
return errors.Wrapf(err, "Unable to check stat for %q", lf.path)
sz := fstat.Size()
if sz == 0 {
// File is empty. We don't need to mmap it. Return.
return nil
y.AssertTrue(sz <= math.MaxUint32)
lf.size = uint32(sz)
if err = lf.mmap(sz); err != nil {
_ = lf.fd.Close()
return errors.Wrapf(err, "Unable to map file: %q", fstat.Name())
// Previously we used to close the file after it was written and reopen it in read-only mode.
// We no longer open files in read-only mode. We keep all vlog files open in read-write mode.
return nil
// You must hold lf.lock to sync()
func (lf *logFile) sync() error {
return y.FileSync(lf.fd)
// Sentinel errors used internally while walking value log files.
var (
	errStop           = errors.New("Stop iteration")
	errTruncate       = errors.New("Do truncate")
	errDeleteVlogFile = errors.New("Delete vlog file")
)
// logEntry is the callback invoked by valueLog.iterate for each entry it reads, together
// with the value pointer (file id, offset, length) describing where the entry lives.
type logEntry func(e Entry, vp valuePointer) error
// safeRead carries the scratch buffers used while decoding entries from a log
// file. k and v are reused (and grown on demand) for keys and values;
// recordOffset is the file offset assigned to the next entry that is read.
type safeRead struct {
	k []byte
	v []byte

	recordOffset uint32
}
// Entry reads one record from reader: headerBufSize header bytes, then the key, the value
// and a 4-byte big-endian CRC32 (Castagnoli table). The checksum is computed over header,
// key and value through the tee reader. Key and value are read into r.k and r.v, which are
// reused between calls, so the returned slices are only valid until the next call.
// errTruncate is returned for an oversized key length, a short read of key/value/checksum
// that ends in io.EOF, or a checksum mismatch.
func (r *safeRead) Entry(reader *bufio.Reader) (*Entry, error) {
var hbuf [headerBufSize]byte
var err error
hash := crc32.New(y.CastagnoliCrcTable)
// Everything read through tee is also fed into the running checksum.
tee := io.TeeReader(reader, hash)
if _, err = io.ReadFull(tee, hbuf[:]); err != nil {
return nil, err
// NOTE(review): h is never populated from hbuf in the visible code, so klen/vlen/meta
// below would all be zero; a decode step appears to be missing here.
var h header
if h.klen > uint32(1<<16) { // Key length must be below uint16.
return nil, errTruncate
kl := int(h.klen)
// Grow the scratch buffers with slack so that slightly larger entries do not reallocate.
if cap(r.k) < kl {
r.k = make([]byte, 2*kl)
vl := int(h.vlen)
if cap(r.v) < vl {
r.v = make([]byte, 2*vl)
e := &Entry{}
e.offset = r.recordOffset
e.Key = r.k[:kl]
e.Value = r.v[:vl]
if _, err = io.ReadFull(tee, e.Key); err != nil {
if err == io.EOF {
err = errTruncate
return nil, err
if _, err = io.ReadFull(tee, e.Value); err != nil {
if err == io.EOF {
err = errTruncate
return nil, err
// The stored checksum is read from reader directly so it is not folded into hash.
var crcBuf [4]byte
if _, err = io.ReadFull(reader, crcBuf[:]); err != nil {
if err == io.EOF {
err = errTruncate
return nil, err
crc := binary.BigEndian.Uint32(crcBuf[:])
if crc != hash.Sum32() {
return nil, errTruncate
e.meta = h.meta
e.UserMeta = h.userMeta
e.ExpiresAt = h.expiresAt
return e, nil
// iterate iterates over log file lf starting at offset and calls fn for each entry read.
// It does not allocate new memory for every kv pair.
// Therefore, the kv pair is only valid for the duration of fn call.
// It returns validEndOffset, the offset just past the last entry accepted as valid (the end
// of a finished transaction, or of an entry outside any transaction).
// NOTE(review): several branch bodies below are empty in the visible code (the io.EOF,
// truncate, nil-entry and txn-mismatch cases); the intended loop exits or skips cannot be
// confirmed from here.
func (vlog *valueLog) iterate(lf *logFile, offset uint32, fn logEntry) (uint32, error) {
fi, err := lf.fd.Stat()
if err != nil {
return 0, err
if int64(offset) == fi.Size() {
// We're at the end of the file already. No need to do anything.
return offset, nil
if vlog.opt.ReadOnly {
// We're not at the end of the file. We'd need to replay the entries, or
// possibly truncate the file.
return 0, ErrReplayNeeded
if int64(offset) > fi.Size() {
// Return 0 which would truncate the entire file. This was the original behavior before
// commit 7539f0a:Fix windows dataloss issue (#1134) was merged.
return 0, nil
// We're not at the end of the file. Let's Seek to the offset and start reading.
if _, err := lf.fd.Seek(int64(offset), io.SeekStart); err != nil {
return 0, errFile(err, lf.path, "Unable to seek")
reader := bufio.NewReader(lf.fd)
read := &safeRead{
k: make([]byte, 10),
v: make([]byte, 10),
recordOffset: offset,
// lastCommit holds the timestamp of the transaction currently being read, 0 when none.
var lastCommit uint64
var validEndOffset uint32 = offset
for {
e, err := read.Entry(reader)
if err == io.EOF {
} else if err == io.ErrUnexpectedEOF || err == errTruncate {
} else if err != nil {
return 0, err
} else if e == nil {
// Build the value pointer for this entry: header + key + value + checksum bytes.
var vp valuePointer
vp.Len = uint32(headerBufSize + len(e.Key) + len(e.Value) + crc32.Size)
read.recordOffset += vp.Len
vp.Offset = e.offset
vp.Fid = lf.fid
if e.meta&bitTxn > 0 {
txnTs := y.ParseTs(e.Key)
if lastCommit == 0 {
lastCommit = txnTs
if lastCommit != txnTs {
} else if e.meta&bitFinTxn > 0 {
// The value of a txn-end entry is the commit timestamp written in decimal.
txnTs, err := strconv.ParseUint(string(e.Value), 10, 64)
if err != nil || lastCommit != txnTs {
// Got the end of txn. Now we can store them.
lastCommit = 0
validEndOffset = read.recordOffset
} else {
if lastCommit != 0 {
// This is most likely an entry which was moved as part of GC.
// We shouldn't get this entry in the middle of a transaction.
validEndOffset = read.recordOffset
if err := fn(*e, vp); err != nil {
if err == errStop {
return 0, errFile(err, lf.path, "Iteration function")
return validEndOffset, nil
// rewrite walks every entry of log file f and, for entries whose LSM pointer still refers
// to this file and offset, re-inserts them through vlog.db.batchSet under a key prefixed
// with badgerMove. Once all entries are written, f is deleted immediately when no
// iterators are active, or queued in vlog.filesToBeDeleted otherwise.
// NOTE(review): in the visible code vp is never decoded from vs.Value, and count, moved
// and loops are never incremented; statements appear to be missing, so the comparisons and
// trace output below cannot be taken at face value.
func (vlog *valueLog) rewrite(f *logFile, tr trace.Trace) error {
maxFid := atomic.LoadUint32(&vlog.maxFid)
y.AssertTruef(uint32(f.fid) < maxFid, "fid to move: %d. Current max fid: %d", f.fid, maxFid)
tr.LazyPrintf("Rewriting fid: %d", f.fid)
// wb accumulates the entries to move; size tracks their estimated total size.
wb := make([]*Entry, 0, 1000)
var size int64
y.AssertTrue(vlog.db != nil)
var count, moved int
fe := func(e Entry) error {
if count%100000 == 0 {
tr.LazyPrintf("Processing entry %d", count)
vs, err := vlog.db.get(e.Key)
if err != nil {
return err
if discardEntry(e, vs, vlog.db) {
return nil
// Value is still present in value log.
if len(vs.Value) == 0 {
return errors.Errorf("Empty value: %+v", vs)
var vp valuePointer
// If the entry found from the LSM Tree points to a newer vlog file, don't do anything.
if vp.Fid > f.fid {
return nil
// If the entry found from the LSM Tree points to an offset greater than the one
// read from vlog, don't do anything.
if vp.Offset > e.offset {
return nil
// If the entry read from LSM Tree and vlog file point to the same vlog file and offset,
// insert them back into the DB.
// NOTE: It might be possible that the entry read from the LSM Tree points to
// an older vlog file. See the comments in the else part.
if vp.Fid == f.fid && vp.Offset == e.offset {
// This new entry only contains the key, and a pointer to the value.
ne := new(Entry)
ne.meta = 0 // Remove all bits. Different keyspace doesn't need these bits.
ne.UserMeta = e.UserMeta
ne.ExpiresAt = e.ExpiresAt
// Create a new key in a separate keyspace, prefixed by moveKey. We are not
// allowed to rewrite an older version of key in the LSM tree, because then this older
// version would be at the top of the LSM tree. To work correctly, reads expect the
// latest versions to be at the top, and the older versions at the bottom.
if bytes.HasPrefix(e.Key, badgerMove) {
ne.Key = append([]byte{}, e.Key...)
} else {
ne.Key = make([]byte, len(badgerMove)+len(e.Key))
n := copy(ne.Key, badgerMove)
copy(ne.Key[n:], e.Key)
// The value is copied because e.Value is only valid for the duration of the callback.
ne.Value = append([]byte{}, e.Value...)
es := int64(ne.estimateSize(vlog.opt.ValueThreshold))
// Consider size of value as well while considering the total size
// of the batch. There have been reports of high memory usage in
// rewrite because we don't consider the value size. See #1292.
es += int64(len(e.Value))
// Ensure length and size of wb is within transaction limits.
if int64(len(wb)+1) >= vlog.opt.maxBatchCount ||
size+es >= vlog.opt.maxBatchSize {
tr.LazyPrintf("request has %d entries, size %d", len(wb), size)
if err := vlog.db.batchSet(wb); err != nil {
return err
size = 0
wb = wb[:0]
wb = append(wb, ne)
size += es
} else {
// It might be possible that the entry read from LSM Tree points to an older vlog file.
// This can happen in the following situation. Assume DB is opened with
// numberOfVersionsToKeep=1
// Now, if we have ONLY one key in the system "FOO" which has been updated 3 times and
// the same key has been garbage collected 3 times, we'll have 3 versions of the movekey
// for the same key "FOO".
// NOTE: moveKeyi is the moveKey with version i
// Assume we have 3 move keys in L0.
// - moveKey1 (points to vlog file 10),
// - moveKey2 (points to vlog file 14) and
// - moveKey3 (points to vlog file 15).
// Also, assume there is another move key "moveKey1" (points to vlog file 6) (this is
// also a move Key for key "FOO" ) on upper levels (let's say 3). The move key
// "moveKey1" on level 0 was inserted because vlog file 6 was GCed.
// Here's what the arrangement looks like
// L0 => (moveKey1 => vlog10), (moveKey2 => vlog14), (moveKey3 => vlog15)
// L1 => ....
// L2 => ....
// L3 => (moveKey1 => vlog6)
// When L0 compaction runs, it keeps only moveKey3 because the number of versions
// to keep is set to 1. (we've dropped moveKey1's latest version)
// The new arrangement of keys is
// L0 => ....
// L1 => (moveKey3 => vlog15)
// L2 => ....
// L3 => (moveKey1 => vlog6)
// Now if we try to GC vlog file 10, the entry read from vlog file will point to vlog10
// but the entry read from LSM Tree will point to vlog6. The move key read from LSM tree
// will point to vlog6 because we've asked for version 1 of the move key.
// This might seem like an issue but it's not really an issue because the user has set
// the number of versions to keep to 1 and the latest version of moveKey points to the
// correct vlog file and offset. The stale move key on L3 will be eventually dropped by
// compaction because there is a newer versions in the upper levels.
return nil
// Run fe over the whole file, starting at offset 0.
_, err := vlog.iterate(f, 0, func(e Entry, vp valuePointer) error {
return fe(e)
if err != nil {
return err
tr.LazyPrintf("request has %d entries, size %d", len(wb), size)
// Write the remaining entries in chunks, halving the chunk size whenever batchSet
// reports ErrTxnTooBig.
batchSize := 1024
var loops int
for i := 0; i < len(wb); {
if batchSize == 0 {
vlog.db.opt.Warningf("We shouldn't reach batch size of zero.")
return ErrNoRewrite
end := i + batchSize
if end > len(wb) {
end = len(wb)
if err := vlog.db.batchSet(wb[i:end]); err != nil {
if err == ErrTxnTooBig {
// Decrease the batch size to half.
batchSize = batchSize / 2
tr.LazyPrintf("Dropped batch size to %d", batchSize)
return err
i += batchSize
tr.LazyPrintf("Processed %d entries in %d loops", len(wb), loops)
tr.LazyPrintf("Total entries: %d. Moved: %d", count, moved)
tr.LazyPrintf("Removing fid: %d", f.fid)
var deleteFileNow bool
// Entries written to LSM. Remove the older file now.
// NOTE(review): filesMap and filesToBeDeleted are accessed here without filesLock being
// taken in the visible code; confirm the locking.
// Just a sanity-check.
if _, ok := vlog.filesMap[f.fid]; !ok {
return errors.Errorf("Unable to find fid: %d", f.fid)
if vlog.iteratorCount() == 0 {
delete(vlog.filesMap, f.fid)
deleteFileNow = true
} else {
vlog.filesToBeDeleted = append(vlog.filesToBeDeleted, f.fid)
if deleteFileNow {
if err := vlog.deleteLogFile(f); err != nil {
return err
return nil
// deleteMoveKeysFor scans all keys with the badgerMove prefix in a read transaction,
// collects a delete entry (meta bitDelete) for every move key whose value pointer refers
// to log file fid, and then writes those deletes through db.batchSet in chunks.
// NOTE(review): in the visible code vp is never decoded from the item and count/pointers
// are never incremented; statements appear to be missing.
func (vlog *valueLog) deleteMoveKeysFor(fid uint32, tr trace.Trace) error {
db := vlog.db
var result []*Entry
var count, pointers uint64
tr.LazyPrintf("Iterating over move keys to find invalids for fid: %d", fid)
err := db.View(func(txn *Txn) error {
opt := DefaultIteratorOptions
opt.InternalAccess = true
// Only keys and metadata are needed, so values are not prefetched.
opt.PrefetchValues = false
itr := txn.NewIterator(opt)
defer itr.Close()
for itr.Seek(badgerMove); itr.ValidForPrefix(badgerMove); itr.Next() {
item := itr.Item()
if item.meta&bitValuePointer == 0 {
var vp valuePointer
if vp.Fid == fid {
e := &Entry{Key: y.KeyWithTs(item.Key(), item.Version()), meta: bitDelete}
result = append(result, e)
return nil
if err != nil {
tr.LazyPrintf("Got error while iterating move keys: %v", err)
return err
tr.LazyPrintf("Num total move keys: %d. Num pointers: %d", count, pointers)
tr.LazyPrintf("Number of invalid move keys found: %d", len(result))
// Write the deletes in chunks, halving the chunk size on ErrTxnTooBig.
batchSize := 10240
for i := 0; i < len(result); {
end := i + batchSize
if end > len(result) {
end = len(result)
if err := db.batchSet(result[i:end]); err != nil {
if err == ErrTxnTooBig {
batchSize /= 2
tr.LazyPrintf("Dropped batch size to %d", batchSize)
tr.LazyPrintf("Error while doing batchSet: %v", err)
return err
i += batchSize
tr.LazyPrintf("Move keys deletion done.")
return nil
func (vlog *valueLog) incrIteratorCount() {
atomic.AddInt32(&vlog.numActiveIterators, 1)
func (vlog *valueLog) iteratorCount() int {
return int(atomic.LoadInt32(&vlog.numActiveIterators))
func (vlog *valueLog) decrIteratorCount() error {
num := atomic.AddInt32(&vlog.numActiveIterators, -1)
if num != 0 {
return nil
lfs := make([]*logFile, 0, len(vlog.filesToBeDeleted))
for _, id := range vlog.filesToBeDeleted {
lfs = append(lfs, vlog.filesMap[id])
delete(vlog.filesMap, id)
vlog.filesToBeDeleted = nil
for _, lf := range lfs {
if err := vlog.deleteLogFile(lf); err != nil {
return err
return nil
func (vlog *valueLog) deleteLogFile(lf *logFile) error {
if lf == nil {
return nil
defer lf.lock.Unlock()
path := vlog.fpath(lf.fid)
if err := lf.munmap(); err != nil {
_ = lf.fd.Close()
return err
lf.fmap = nil
if err := lf.fd.Close(); err != nil {
return err
return os.Remove(path)
func (vlog *valueLog) dropAll() (int, error) {
// We don't want to block dropAll on any pending transactions. So, don't worry about iterator
// count.
var count int
deleteAll := func() error {
defer vlog.filesLock.Unlock()
for _, lf := range vlog.filesMap {
if err := vlog.deleteLogFile(lf); err != nil {
return err
vlog.filesMap = make(map[uint32]*logFile)
return nil
if err := deleteAll(); err != nil {
return count, err
vlog.db.opt.Infof("Value logs deleted. Creating value log file: 0")
if _, err := vlog.createVlogFile(0); err != nil {
return count, err
atomic.StoreUint32(&vlog.maxFid, 0)
return count, nil
// lfDiscardStats keeps track of the amount of data that could be discarded for
// a given logfile.
type lfDiscardStats struct {
m map[uint32]int64
flushChan chan map[uint32]int64
closer *y.Closer
updatesSinceFlush int
type valueLog struct {
dirPath string
elog trace.EventLog
// guards our view of which files exist, which to be deleted, how many active iterators
filesLock sync.RWMutex
filesMap map[uint32]*logFile
filesToBeDeleted []uint32
// A refcount of iterators -- when this hits zero, we can delete the filesToBeDeleted.
numActiveIterators int32
db *DB
maxFid uint32 // accessed via atomics.
writableLogOffset uint32 // read by read, written by write. Must access via atomics.
numEntriesWritten uint32
opt Options
garbageCh chan struct{}
lfDiscardStats *lfDiscardStats
// vlogFilePath returns the path of the value log file with the given id inside
// dirPath: the id zero-padded to six digits, followed by the ".vlog" suffix.
func vlogFilePath(dirPath string, fid uint32) string {
	return fmt.Sprintf("%s%s%06d.vlog", dirPath, string(os.PathSeparator), fid)
}
func (vlog *valueLog) fpath(fid uint32) string {
return vlogFilePath(vlog.dirPath, fid)
func (vlog *valueLog) populateFilesMap() error {
vlog.filesMap = make(map[uint32]*logFile)
files, err := ioutil.ReadDir(vlog.dirPath)
if err != nil {
return errFile(err, vlog.dirPath, "Unable to open log dir.")
found := make(map[uint64]struct{})
for _, file := range files {
if !strings.HasSuffix(file.Name(), ".vlog") {
fsz := len(file.Name())
fid, err := strconv.ParseUint(file.Name()[:fsz-5], 10, 32)
if err != nil {
return errFile(err, file.Name(), "Unable to parse log id.")
if _, ok := found[fid]; ok {
return errFile(err, file.Name(), "Duplicate file found. Please delete one.")
found[fid] = struct{}{}
lf := &logFile{
fid: uint32(fid),
path: vlog.fpath(uint32(fid)),
loadingMode: vlog.opt.ValueLogLoadingMode,
vlog.filesMap[uint32(fid)] = lf
if vlog.maxFid < uint32(fid) {
vlog.maxFid = uint32(fid)
return nil
// createVlogFile creates the value log file with the given id, resets the writable offset
// and the written-entries counter, syncs the value log directory, maps the file at twice
// opt.ValueLogFileSize and registers it in vlog.filesMap.
// NOTE(review): the removeFile closure has no body and is never called in the visible
// code, and filesMap is written without filesLock; statements appear to be missing.
func (vlog *valueLog) createVlogFile(fid uint32) (*logFile, error) {
path := vlog.fpath(fid)
lf := &logFile{
fid: fid,
path: path,
loadingMode: vlog.opt.ValueLogLoadingMode,
// writableLogOffset is only written by write func, by read by Read func.
// To avoid a race condition, all reads and updates to this variable must be
// done via atomics.
atomic.StoreUint32(&vlog.writableLogOffset, 0)
vlog.numEntriesWritten = 0
var err error
if lf.fd, err = y.CreateSyncedFile(path, vlog.opt.SyncWrites); err != nil {
return nil, errFile(err, lf.path, "Create value log file")
removeFile := func() {
// Remove the file so that we don't get an error when createVlogFile is
// called for the same fid, again. This could happen if there is an
// transient error because of which we couldn't create a new file
// and the second attempt to create the file succeeds.
if err = syncDir(vlog.dirPath); err != nil {
return nil, errFile(err, vlog.dirPath, "Sync value log dir")
if err = lf.mmap(2 * vlog.opt.ValueLogFileSize); err != nil {
return nil, errFile(err, lf.path, "Mmap value log file")
vlog.filesMap[fid] = lf
return lf, nil
// errFile builds an error of the form "<msg>. Path=<path>. Error=<err>",
// attaching the file path and a short description to err.
func errFile(err error, path string, msg string) error {
	return fmt.Errorf("%s. Path=%s. Error=%v", msg, path, err)
}
func (vlog *valueLog) replayLog(lf *logFile, offset uint32, replayFn logEntry) error {
fi, err := lf.fd.Stat()
if err != nil {
return errFile(err, lf.path, "Unable to run file.Stat")
// Alright, let's iterate now.
endOffset, err := vlog.iterate(lf, offset, replayFn)
if err != nil {
return errFile(err, lf.path, "Unable to replay logfile")
if int64(endOffset) == fi.Size() {
return nil
// End offset is different from file size. So, we should truncate the file
// to that size.
y.AssertTrue(int64(endOffset) <= fi.Size())
if !vlog.opt.Truncate {
return ErrTruncateNeeded
// The entire file should be truncated (i.e. it should be deleted).
// If fid == maxFid then it's okay to truncate the entire file since it will be
// used for future additions. Also, it's okay if the last file has size zero.
// We mmap 2*opt.ValueLogSize for the last file. See vlog.Open() function
if endOffset == 0 && lf.fid != vlog.maxFid {
return errDeleteVlogFile
if err := lf.fd.Truncate(int64(endOffset)); err != nil {
return errFile(err, lf.path, fmt.Sprintf(
"Truncation needed at offset %d. Can be done manually as well.", endOffset))
return nil
// init initializes the value log struct. This initialization needs to happen
// before compactions start.
func (vlog *valueLog) init(db *DB) {
vlog.opt = db.opt
vlog.db = db
vlog.dirPath = vlog.opt.ValueDir
vlog.elog = y.NoEventLog
if db.opt.EventLogging {
vlog.elog = trace.NewEventLog("Badger", "Valuelog")
vlog.garbageCh = make(chan struct{}, 1) // Only allow one GC at a time.
vlog.lfDiscardStats = &lfDiscardStats{
m: make(map[uint32]int64),
closer: y.NewCloser(1),
flushChan: make(chan map[uint32]int64, 16),
// open prepares the value log for use: it starts the discard-stats flusher, discovers the
// existing log files, opens each of them, replays files at or after the value head pointer
// ptr through replayFn, maps the files, and positions the write offset at the end of the
// file with the highest id. If no log file exists, file 0 is created instead.
// NOTE(review): this block is visibly incomplete (the call on the "flags" line below has
// lost its callee, and several branch bodies lack their exits), so the control flow as
// shown cannot be relied upon.
func (vlog *valueLog) open(db *DB, ptr valuePointer, replayFn logEntry) error {
go vlog.flushDiscardStats()
if err := vlog.populateFilesMap(); err != nil {
return err
// If no files are found, then create a new file.
if len(vlog.filesMap) == 0 {
_, err := vlog.createVlogFile(0)
return err
fids := vlog.sortedFids()
for _, fid := range fids {
lf, ok := vlog.filesMap[fid]
var flags uint32
switch {
case vlog.opt.ReadOnly:
// If we have read only, we don't need SyncWrites.
flags |= y.ReadOnly
// Set sync flag.
case vlog.opt.SyncWrites:
flags |= y.Sync
// We cannot mmap the files upfront here. Windows does not like mmapped files to be
// truncated. We might need to truncate files during a replay.
if err :=, flags); err != nil {
return err
// This file is before the value head pointer. So, we don't need to
// replay it, and can just open it in readonly mode.
if fid < ptr.Fid {
// Mmap the file here, we don't need to replay it.
if err := lf.mmap(int64(lf.size)); err != nil {
return err
// For the file containing the head pointer, replay starts right after the entry the
// pointer refers to; later files are replayed from offset 0.
var offset uint32
if fid == ptr.Fid {
offset = ptr.Offset + ptr.Len
vlog.db.opt.Infof("Replaying file id: %d at offset: %d\n", fid, offset)
now := time.Now()
// Replay and possible truncation done. Now we can open the file as per
// user specified options.
if err := vlog.replayLog(lf, offset, replayFn); err != nil {
// Log file is corrupted. Delete it.
if err == errDeleteVlogFile {
delete(vlog.filesMap, fid)
// Close the fd of the file before deleting the file otherwise windows complaints.
if err := lf.fd.Close(); err != nil {
return errors.Wrapf(err, "failed to close vlog file %s", lf.fd.Name())
path := vlog.fpath(lf.fid)
if err := os.Remove(path); err != nil {
return y.Wrapf(err, "failed to delete empty value log file: %q", path)
return err
vlog.db.opt.Infof("Replay took: %s\n", time.Since(now))
if fid < vlog.maxFid {
// This file has been replayed. It can now be mmapped.
// For maxFid, the mmap would be done by the specially written code below.
if err := lf.mmap(int64(lf.size)); err != nil {
return err
// Seek to the end to start writing.
last, ok := vlog.filesMap[vlog.maxFid]
lastOffset, err := last.fd.Seek(0, io.SeekEnd)
if err != nil {
return errFile(err, last.path, "file.Seek to end")
vlog.writableLogOffset = uint32(lastOffset)
// Update the head to point to the updated tail. Otherwise, even after doing a successful
// replay and closing the DB, the value log head does not get updated, which causes the replay
// to happen repeatedly.
vlog.db.vhead = valuePointer{Fid: vlog.maxFid, Offset: uint32(lastOffset)}
// Map the file if needed. When we create a file, it is automatically mapped.
if err = last.mmap(2 * db.opt.ValueLogFileSize); err != nil {
return errFile(err, last.path, "Map log file")
if err := vlog.populateDiscardStats(); err != nil {
// Print the error and continue. We don't want to prevent value log open if there's an error
// with the fetching discards stats.
db.opt.Errorf("Failed to populate discard stats: %s", err)
return nil
func (lf *logFile) open(path string, flags uint32) error {
var err error
if lf.fd, err = y.OpenExistingFile(path, flags); err != nil {
return y.Wrapf(err, "Error while opening file in logfile %s", path)
fi, err := lf.fd.Stat()
if err != nil {
return errFile(err, lf.path, "Unable to run file.Stat")
sz := fi.Size()
sz <= math.MaxUint32,
"file size: %d greater than %d",
uint32(sz), uint32(math.MaxUint32),
lf.size = uint32(sz)
return nil
// Close unmaps and closes every log file in filesMap. When the value log is not read-only,
// the file with the highest id is first truncated to the current write offset. Each file's
// exclusive lock is taken and intentionally never released. The first error encountered is
// returned, but all files are still processed.
func (vlog *valueLog) Close() error {
// close flushDiscardStats.
// NOTE(review): no statement follows the comment above in the visible code; confirm
// whether the shutdown of the discard-stats flusher is missing here.
vlog.elog.Printf("Stopping garbage collection of values.")
defer vlog.elog.Finish()
var err error
for id, f := range vlog.filesMap {
f.lock.Lock() // We wont release the lock.
if munmapErr := f.munmap(); munmapErr != nil && err == nil {
err = munmapErr
maxFid := atomic.LoadUint32(&vlog.maxFid)
if !vlog.opt.ReadOnly && id == maxFid {
// truncate writable log file to correct offset.
if truncErr := f.fd.Truncate(
int64(vlog.woffset())); truncErr != nil && err == nil {
err = truncErr
if closeErr := f.fd.Close(); closeErr != nil && err == nil {
err = closeErr
return err
// sortedFids returns the file id's not pending deletion, sorted. Assumes we have shared access to
// filesMap.
func (vlog *valueLog) sortedFids() []uint32 {
toBeDeleted := make(map[uint32]struct{})
for _, fid := range vlog.filesToBeDeleted {
toBeDeleted[fid] = struct{}{}
ret := make([]uint32, 0, len(vlog.filesMap))
for fid := range vlog.filesMap {
if _, ok := toBeDeleted[fid]; !ok {
ret = append(ret, fid)
sort.Slice(ret, func(i, j int) bool {
return ret[i] < ret[j]
return ret
type request struct {
// Input values
Entries []*Entry
// Output values and wait group stuff below
Ptrs []valuePointer
Wg sync.WaitGroup
Err error
ref int32
func (req *request) reset() {
req.Entries = req.Entries[:0]
req.Ptrs = req.Ptrs[:0]
req.Wg = sync.WaitGroup{}
req.Err = nil
req.ref = 0
func (req *request) IncrRef() {
atomic.AddInt32(&req.ref, 1)
// DecrRef atomically decrements the request's reference count.
// NOTE(review): the body of the nRef > 0 branch is not visible, so it is unclear from here
// whether Entries is cleared only once the count reaches zero; confirm the intended
// early return and any cleanup that follows.
func (req *request) DecrRef() {
nRef := atomic.AddInt32(&req.ref, -1)
if nRef > 0 {
req.Entries = nil
func (req *request) Wait() error {
err := req.Err
req.DecrRef() // DecrRef after writing to DB.
return err
type requests []*request
func (reqs requests) DecrRef() {
for _, req := range reqs {
func (reqs requests) IncrRef() {
for _, req := range reqs {
// sync function syncs content of latest value log file to disk. Syncing of value log directory is
// not required here as it happens every time a value log file rotation happens(check createVlogFile
// function). During rotation, previous value log file also gets synced to disk. It only syncs file
// if fid >= vlog.maxFid. In some cases such as replay(while opening db), it might be called with
// fid < vlog.maxFid. To sync irrespective of file id just call it with math.MaxUint32.
func (vlog *valueLog) sync(fid uint32) error {
if vlog.opt.SyncWrites {
return nil
maxFid := atomic.LoadUint32(&vlog.maxFid)
// During replay it is possible to get sync call with fid less than maxFid.
// Because older file has already been synced, we can return from here.
if fid < maxFid || len(vlog.filesMap) == 0 {
return nil
curlf := vlog.filesMap[maxFid]
// Sometimes it is possible that vlog.maxFid has been increased but file creation
// with same id is still in progress and this function is called. In those cases
// entry for the file might not be present in vlog.filesMap.
if curlf == nil {
return nil
err := curlf.sync()
return err
func (vlog *valueLog) woffset() uint32 {
return atomic.LoadUint32(&vlog.writableLogOffset)
// write encodes the entries of every request into a buffer, records a value pointer per
// entry in the request's Ptrs, and appends the buffer to the current (highest-id) log
// file. When the file exceeds opt.ValueLogFileSize or the entry limit, it is finalized via
// doneWriting and a new log file is created.
// write is thread-unsafe by design and should not be called concurrently.
// NOTE(review): in the visible code `written` is never incremented, the skipVlog branch
// has no exit, and filesMap is read without filesLock; statements appear to be missing.
func (vlog *valueLog) write(reqs []*request) error {
maxFid := atomic.LoadUint32(&vlog.maxFid)
curlf := vlog.filesMap[maxFid]
var buf bytes.Buffer
// flushWrites appends the buffered bytes to the current file and advances the offsets.
flushWrites := func() error {
if buf.Len() == 0 {
return nil
vlog.elog.Printf("Flushing buffer of size %d to vlog", buf.Len())
n, err := curlf.fd.Write(buf.Bytes())
if err != nil {
return errors.Wrapf(err, "Unable to write to value log file: %q", curlf.path)
atomic.AddUint32(&vlog.writableLogOffset, uint32(n))
// NOTE(review): writableLogOffset is read here without an atomic load, although the
// field is documented as atomics-only.
atomic.StoreUint32(&curlf.size, vlog.writableLogOffset)
return nil
// toDisk flushes the buffer and rotates to a new log file once a limit is exceeded.
toDisk := func() error {
if err := flushWrites(); err != nil {
return err
if vlog.woffset() > uint32(vlog.opt.ValueLogFileSize) ||
vlog.numEntriesWritten > vlog.opt.ValueLogMaxEntries {
if err := curlf.doneWriting(vlog.woffset()); err != nil {
return err
newid := atomic.AddUint32(&vlog.maxFid, 1)
y.AssertTruef(newid > 0, "newid has overflown uint32: %v", newid)
newlf, err := vlog.createVlogFile(newid)
if err != nil {
return err
curlf = newlf
atomic.AddInt32(&vlog.db.logRotates, 1)
return nil
for i := range reqs {
b := reqs[i]
b.Ptrs = b.Ptrs[:0]
var written int
for j := range b.Entries {
e := b.Entries[j]
if e.skipVlog {
b.Ptrs = append(b.Ptrs, valuePointer{})
var p valuePointer
p.Fid = curlf.fid
// Use the offset including buffer length so far.
p.Offset = vlog.woffset() + uint32(buf.Len())
plen, err := encodeEntry(e, &buf) // Now encode the entry into buffer.
if err != nil {
return err
p.Len = uint32(plen)
b.Ptrs = append(b.Ptrs, p)
// It is possible that the size of the buffer grows beyond the max size of the value
// log (this happens when a transaction contains entries with large value sizes) and
// badger might run into out of memory errors. We flush the buffer here if it's size
// grows beyond the max value log size.
if int64(buf.Len()) > vlog.db.opt.ValueLogFileSize {
if err := flushWrites(); err != nil {
return err
vlog.numEntriesWritten += uint32(written)
// We write to disk here so that all entries that are part of the same transaction are
// written to the same vlog file.
writeNow :=
vlog.woffset()+uint32(buf.Len()) > uint32(vlog.opt.ValueLogFileSize) ||
vlog.numEntriesWritten > uint32(vlog.opt.ValueLogMaxEntries)
if writeNow {
if err := toDisk(); err != nil {
return err
return toDisk()
// Gets the logFile and acquires and RLock() for the mmap. You must call RUnlock on the file
// (if non-nil)
func (vlog *valueLog) getFileRLocked(fid uint32) (*logFile, error) {
defer vlog.filesLock.RUnlock()
ret, ok := vlog.filesMap[fid]
if !ok {
// log file has gone away, will need to retry the operation.
return nil, ErrRetry
return ret, nil
// Read reads the value log at a given location.
// It returns the value bytes of the entry at vp together with a callback that is passed
// through from readValueBytes. When opt.VerifyValueChecksum is set, the trailing CRC32
// (Castagnoli, big-endian) is verified against the preceding bytes first.
// TODO: Make this read private.
func (vlog *valueLog) Read(vp valuePointer, s *y.Slice) ([]byte, func(), error) {
// Check for valid offset if we are reading from writable log.
maxFid := atomic.LoadUint32(&vlog.maxFid)
if vp.Fid == maxFid && vp.Offset >= vlog.woffset() {
return nil, nil, errors.Errorf(
"Invalid value pointer offset: %d greater than current offset: %d",
vp.Offset, vlog.woffset())
buf, cb, err := vlog.readValueBytes(vp, s)
if err != nil {
return nil, cb, err
// NOTE(review): the checksum error paths below return a nil callback without invoking
// cb in the visible code; if cb releases the file's read lock, that lock would stay
// held. Confirm whether a call to cb is missing.
if vlog.opt.VerifyValueChecksum {
hash := crc32.New(y.CastagnoliCrcTable)
if _, err := hash.Write(buf[:len(buf)-crc32.Size]); err != nil {
return nil, nil, errors.Wrapf(err, "failed to write hash for vp %+v", vp)
// Fetch checksum from the end of the buffer.
checksum := buf[len(buf)-crc32.Size:]
res := binary.BigEndian.Uint32(checksum)
if hash.Sum32() != res {
return nil, nil, errors.Errorf("checksum mismatch Error: value corrupted for vp: %+v", vp)
// NOTE(review): h is never populated from buf in the visible code, so klen and vlen
// would be zero here; a decode step appears to be missing.
var h header
n := uint32(headerBufSize) + h.klen
return buf[n : n+h.vlen], cb, nil
func (vlog *valueLog) readValueBytes(vp valuePointer, s *y.Slice) ([]byte, func(), error) {
lf, err := vlog.getFileRLocked(vp.Fid)
if err != nil {
return nil, nil, err
buf, err :=, s)
if vlog.opt.ValueLogLoadingMode == options.MemoryMap {
return buf, lf.lock.RUnlock, err
// If we are using File I/O we unlock the file immediately
// and return an empty function as callback.
return buf, nil, err
// Test helper
func valueBytesToEntry(buf []byte) (e Entry) {
var h header
n := uint32(headerBufSize)
e.Key = buf[n : n+h.klen]
n += h.klen
e.meta = h.meta
e.UserMeta = h.userMeta
e.Value = buf[n : n+h.vlen]
func (vlog *valueLog) pickLog(head valuePointer, tr trace.Trace) (files []*logFile) {
defer vlog.filesLock.RUnlock()
fids := vlog.sortedFids()
if len(fids) <= 1 {
tr.LazyPrintf("Only one or less value log file.")
return nil
} else if head.Fid == 0 {
tr.LazyPrintf("Head pointer is at zero.")
return nil
// Pick a candidate that contains the largest amount of discardable data
candidate := struct {
fid uint32
discard int64
}{math.MaxUint32, 0}
for _, fid := range fids {
if fid >= head.Fid {
if vlog.lfDiscardStats.m[fid] > candidate.discard {
candidate.fid = fid
candidate.discard = vlog.lfDiscardStats.m[fid]
if candidate.fid != math.MaxUint32 { // Found a candidate
tr.LazyPrintf("Found candidate via discard stats: %v", candidate)
files = append(files, vlog.filesMap[candidate.fid])
} else {
tr.LazyPrintf("Could not find candidate via discard stats. Randomly picking one.")
// Fallback to randomly picking a log file
var idxHead int
for i, fid := range fids {
if fid == head.Fid {
idxHead = i
if idxHead == 0 { // Not found or first file
tr.LazyPrintf("Could not find any file.")
return nil
idx := rand.Intn(idxHead) // Dont include head.Fid. We pick a random file before it.
if idx > 0 {
idx = rand.Intn(idx + 1) // Another level of rand to favor smaller fids.
tr.LazyPrintf("Randomly chose fid: %d", fids[idx])
files = append(files, vlog.filesMap[fids[idx]])
return files
func discardEntry(e Entry, vs y.ValueStruct, db *DB) bool {
if vs.Version != y.ParseTs(e.Key) {
// Version not found. Discard.
return true
if isDeletedOrExpired(vs.Meta, vs.ExpiresAt) {
return true
if (vs.Meta & bitValuePointer) == 0 {
// Key also stores the value in LSM. Discard.
return true
if (vs.Meta & bitFinTxn) > 0 {
// Just a txn finish entry. Discard.
return true
if bytes.HasPrefix(e.Key, badgerMove) {
// Verify the actual key entry without the badgerPrefix has not been deleted.
// If this is not done the badgerMove entry will be kept forever moving from
// vlog to vlog during rewrites.
avs, err := db.get(e.Key[len(badgerMove):])
if err != nil {
return false
return avs.Version == 0
return false
func (vlog *valueLog) doRunGC(lf *logFile, discardRatio float64, tr trace.Trace) (err error) {
// Update stats before exiting
defer func() {
if err == nil {
delete(vlog.lfDiscardStats.m, lf.fid)
type reason struct {
total float64
discard float64
count int
fi, err := lf.fd.Stat()
if err != nil {
tr.LazyPrintf("Error while finding file size: %v", err)
return err
// Set up the sampling window sizes.
sizeWindow := float64(fi.Size()) * 0.1 // 10% of the file as window.
sizeWindowM := sizeWindow / (1 << 20) // in MBs.
countWindow := int(float64(vlog.opt.ValueLogMaxEntries) * 0.01) // 1% of num entries.
tr.LazyPrintf("Size window: %5.2f. Count window: %d.", sizeWindow, countWindow)
// Pick a random start point for the log.
skipFirstM := float64(rand.Int63n(fi.Size())) // Pick a random starting location.
skipFirstM -= sizeWindow // Avoid hitting EOF by moving back by window.
skipFirstM /= float64(mi) // Convert to MBs.
tr.LazyPrintf("Skip first %5.2f MB of file of size: %d MB", skipFirstM, fi.Size()/mi)
var skipped float64
var r reason
start := time.Now()
y.AssertTrue(vlog.db != nil)
s := new(y.Slice)
var numIterations int
_, err = vlog.iterate(lf, 0, func(e Entry, vp valuePointer) error {
esz := float64(vp.Len) / (1 << 20) // in MBs.
if skipped < skipFirstM {
skipped += esz
return nil
// Sample until we reach the window sizes or exceed 10 seconds.
if r.count > countWindow {
tr.LazyPrintf("Stopping sampling after %d entries.", countWindow)
return errStop
if > sizeWindowM {
tr.LazyPrintf("Stopping sampling after reaching window size.")
return errStop
if time.Since(start) > 10*time.Second {
tr.LazyPrintf("Stopping sampling after 10 seconds.")
return errStop
} += esz
vs, err := vlog.db.get(e.Key)
if err != nil {
return err
if discardEntry(e, vs, vlog.db) {
r.discard += esz
return nil
// Value is still present in value log.
y.AssertTrue(len(vs.Value) > 0)
if vp.Fid > lf.fid {
// Value is present in a later log. Discard.
r.discard += esz
return nil
if vp.Offset > e.offset {
// Value is present in a later offset, but in the same log.
r.discard += esz
return nil
if vp.Fid == lf.fid && vp.Offset == e.offset {
// This is still the active entry. This would need to be rewritten.
} else {
vlog.elog.Printf("Reason=%+v\n", r)
buf, cb, err := vlog.readValueBytes(vp, s)
if err != nil {
return errStop
ne := valueBytesToEntry(buf)
ne.offset = vp.Offset
ne.print("Latest Entry Header in LSM")
e.print("Latest Entry in Log")
return errors.Errorf("This shouldn't happen. Latest Pointer:%+v. Meta:%v.",
vp, vs.Meta)
return nil
if err != nil {
tr.LazyPrintf("Error while iterating for RunGC: %v", err)
return err
tr.LazyPrintf("Fid: %d. Skipped: %5.2fMB Num iterations: %d. Data status=%+v\n",
lf.fid, skipped, numIterations, r)
// If we couldn't sample at least a 1000 KV pairs or at least 75% of the window size,
// and what we can discard is below the threshold, we should skip the rewrite.
if (r.count < countWindow && < sizeWindowM*0.75) || r.discard < discardRatio* {
tr.LazyPrintf("Skipping GC on fid: %d", lf.fid)
return ErrNoRewrite
if err = vlog.rewrite(lf, tr); err != nil {
return err
tr.LazyPrintf("Done rewriting.")
return nil
func (vlog *valueLog) waitOnGC(lc *y.Closer) {
defer lc.Done()
<-lc.HasBeenClosed() // Wait for lc to be closed.
// Block any GC in progress to finish, and don't allow any more writes to runGC by filling up
// the channel of size 1.
vlog.garbageCh <- struct{}{}
func (vlog *valueLog) runGC(discardRatio float64, head valuePointer) error {
select {
case vlog.garbageCh <- struct{}{}:
// Pick a log file for GC.
tr := trace.New("Badger.ValueLog", "GC")
defer func() {
var err error
files := vlog.pickLog(head, tr)
if len(files) == 0 {
tr.LazyPrintf("PickLog returned zero results.")
return ErrNoRewrite
tried := make(map[uint32]bool)
for _, lf := range files {
if _, done := tried[lf.fid]; done {
tried[lf.fid] = true
err = vlog.doRunGC(lf, discardRatio, tr)
if err == nil {
return vlog.deleteMoveKeysFor(lf.fid, tr)
return err
return ErrRejected
func (vlog *valueLog) updateDiscardStats(stats map[uint32]int64) {
select {
case vlog.lfDiscardStats.flushChan <- stats:
vlog.opt.Warningf("updateDiscardStats called: discard stats flushChan full, " +
"returning without pushing to flushChan")
func (vlog *valueLog) flushDiscardStats() {
defer vlog.lfDiscardStats.closer.Done()
mergeStats := func(stats map[uint32]int64) ([]byte, error) {
defer vlog.lfDiscardStats.Unlock()
for fid, count := range stats {
vlog.lfDiscardStats.m[fid] += count
if vlog.lfDiscardStats.updatesSinceFlush > discardStatsFlushThreshold {
encodedDS, err := json.Marshal(vlog.lfDiscardStats.m)
if err != nil {
return nil, err
vlog.lfDiscardStats.updatesSinceFlush = 0
return encodedDS, nil
return nil, nil
process := func(stats map[uint32]int64) error {
encodedDS, err := mergeStats(stats)
if err != nil || encodedDS == nil {
return err
entries := []*Entry{{
Key: y.KeyWithTs(lfDiscardStatsKey, 1),
Value: encodedDS,
req, err := vlog.db.sendToWriteCh(entries)
// No special handling of ErrBlockedWrites is required as err is just logged in
// for loop below.
if err != nil {
return errors.Wrapf(err, "failed to push discard stats to write channel")
return req.Wait()
closer := vlog.lfDiscardStats.closer
for {
select {
case <-closer.HasBeenClosed():
// For simplicity just return without processing already present in stats in flushChan.
case stats := <-vlog.lfDiscardStats.flushChan:
if err := process(stats); err != nil {
vlog.opt.Errorf("unable to process discardstats with error: %s", err)
// populateDiscardStats populates vlog.lfDiscardStats.
// This function will be called while initializing valueLog.
func (vlog *valueLog) populateDiscardStats() error {
key := y.KeyWithTs(lfDiscardStatsKey, math.MaxUint64)
var statsMap map[uint32]int64
var val []byte
var vp valuePointer
for {
vs, err := vlog.db.get(key)
if err != nil {
return err
// Value doesn't exist.
if vs.Meta == 0 && len(vs.Value) == 0 {
vlog.opt.Debugf("Value log discard stats empty")
return nil
// Entry stored in LSM tree.
if vs.Meta&bitValuePointer == 0 {
val = y.SafeCopy(val, vs.Value)
// Read entry from value log.
result, cb, err := vlog.Read(vp, new(y.Slice))
val = y.SafeCopy(val, result)
// The result is stored in val. We can break the loop from here.
if err == nil {
if err != ErrRetry {
return err
// If we're at this point it means we haven't found the value yet and if the current key has
// badger move prefix, we should break from here since we've already tried the original key
// and the key with move prefix. "val" would be empty since we haven't found the value yet.
if bytes.HasPrefix(key, badgerMove) {
// If we're at this point it means the discard stats key was moved by the GC and the actual
// entry is the one prefixed by badger move key.
// Prepend existing key with badger move and search for the key.
key = append(badgerMove, key...)
if len(val) == 0 {
return nil
if err := json.Unmarshal(val, &statsMap); err != nil {
return errors.Wrapf(err, "failed to unmarshal discard stats")
vlog.opt.Debugf("Value Log Discard stats: %v", statsMap)
vlog.lfDiscardStats.flushChan <- statsMap
return nil