/*
|
|
* Copyright 2017 Dgraph Labs, Inc. and Contributors
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
package badger
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"encoding/binary"
|
|
"expvar"
|
|
"io"
|
|
"math"
|
|
"os"
|
|
"path/filepath"
|
|
"sort"
|
|
"strconv"
|
|
"sync"
|
|
"sync/atomic"
|
|
"time"
|
|
|
|
"github.com/dgraph-io/badger/options"
|
|
"github.com/dgraph-io/badger/pb"
|
|
"github.com/dgraph-io/badger/skl"
|
|
"github.com/dgraph-io/badger/table"
|
|
"github.com/dgraph-io/badger/y"
|
|
humanize "github.com/dustin/go-humanize"
|
|
"github.com/pkg/errors"
|
|
"golang.org/x/net/trace"
|
|
)
|
|
|
|
// Internal keys. Anything prefixed with badgerPrefix belongs to badger itself
// and is never surfaced to users.
var (
	badgerPrefix      = []byte("!badger!")        // Prefix for internal keys used by badger.
	head              = []byte("!badger!head")    // For storing value offset for replay.
	txnKey            = []byte("!badger!txn")     // For indicating end of entries in txn.
	badgerMove        = []byte("!badger!move")    // For key-value pairs which got moved during GC.
	lfDiscardStatsKey = []byte("!badger!discard") // For storing lfDiscardStats
)
|
|
|
|
// closers bundles the y.Closer handles used to signal and wait for each of
// the DB's background goroutines during shutdown (see DB.close for ordering).
type closers struct {
	updateSize *y.Closer // Periodic directory-size metric refresh (updateSize).
	compactors *y.Closer // Level compaction workers.
	memtable   *y.Closer // Memtable flushing goroutine (flushMemtable).
	writes     *y.Closer // Write-batching goroutine (doWrites).
	valueGC    *y.Closer // Value log garbage collection.
	pub        *y.Closer // Publisher fan-out goroutine.
}
|
|
|
|
// DB provides the various functions required to interact with Badger.
// DB is thread-safe.
type DB struct {
	sync.RWMutex // Guards list of inmemory tables, not individual reads and writes.

	dirLockGuard *directoryLockGuard
	// nil if Dir and ValueDir are the same
	valueDirGuard *directoryLockGuard

	closers   closers         // Stop-handles for background goroutines.
	elog      trace.EventLog  // Event log; y.NoEventLog unless opt.EventLogging is set.
	mt        *skl.Skiplist   // Our latest (actively written) in-memory table
	imm       []*skl.Skiplist // Add here only AFTER pushing to flushChan.
	opt       Options
	manifest  *manifestFile
	lc        *levelsController
	vlog      valueLog
	vhead     valuePointer // less than or equal to a pointer to the last vlog value put into mt
	writeCh   chan *request
	flushChan chan flushTask // For flushing memtables.
	closeOnce sync.Once      // For closing DB only once.

	// Number of log rotates since the last memtable flush. We will access this field via atomic
	// functions. Since we are not going to use any 64bit atomic functions, there is no need for
	// 64 bit alignment of this struct(see #311).
	logRotates int32

	// Set to 1 atomically to reject new writes; checked in sendToWriteCh.
	blockWrites int32

	// Transaction timestamp oracle.
	orc *oracle

	// Fan-out of committed updates to subscribers.
	pub *publisher
}
|
|
|
|
const (
	// kvWriteChCapacity bounds the number of pending write requests queued in
	// DB.writeCh before senders block.
	kvWriteChCapacity = 1000
)
|
|
|
|
// replayFunction returns a closure that the value log invokes once per entry
// during startup replay. Entries tagged bitTxn are buffered until their
// bitFinTxn marker arrives and are then applied to the memtable as a unit;
// untagged entries (produced by vlog rewrites) are applied immediately.
func (db *DB) replayFunction() func(Entry, valuePointer) error {
	type txnEntry struct {
		nk []byte
		v  y.ValueStruct
	}

	var txn []txnEntry     // Entries of the currently-buffered transaction.
	var lastCommit uint64  // Commit ts of the transaction being buffered; 0 if none.

	// toLSM writes one key into the memtable, polling until there is room.
	toLSM := func(nk []byte, vs y.ValueStruct) {
		for err := db.ensureRoomForWrite(); err != nil; err = db.ensureRoomForWrite() {
			db.elog.Printf("Replay: Making room for writes")
			time.Sleep(10 * time.Millisecond)
		}
		db.mt.Put(nk, vs)
	}

	first := true
	return func(e Entry, vp valuePointer) error { // Function for replaying.
		if first {
			db.elog.Printf("First key=%q\n", e.Key)
		}
		first = false

		// Keep the oracle's next timestamp ahead of every replayed key.
		db.orc.Lock()
		if db.orc.nextTxnTs < y.ParseTs(e.Key) {
			db.orc.nextTxnTs = y.ParseTs(e.Key)
		}
		db.orc.Unlock()

		nk := make([]byte, len(e.Key))
		copy(nk, e.Key)
		var nv []byte
		meta := e.meta
		if db.shouldWriteValueToLSM(e) {
			// Small value: store it inline in the LSM.
			nv = make([]byte, len(e.Value))
			copy(nv, e.Value)
		} else {
			// Large value: store the encoded value-log pointer instead.
			nv = make([]byte, vptrSize)
			vp.Encode(nv)
			meta = meta | bitValuePointer
		}
		// Update vhead. If the crash happens while replay was in progess
		// and the head is not updated, we will end up replaying all the
		// files starting from file zero, again.
		db.updateHead([]valuePointer{vp})

		v := y.ValueStruct{
			Value:     nv,
			Meta:      meta,
			UserMeta:  e.UserMeta,
			ExpiresAt: e.ExpiresAt,
		}

		if e.meta&bitFinTxn > 0 {
			// End-of-transaction marker: its value is the commit ts in decimal.
			txnTs, err := strconv.ParseUint(string(e.Value), 10, 64)
			if err != nil {
				return errors.Wrapf(err, "Unable to parse txn fin: %q", e.Value)
			}
			y.AssertTrue(lastCommit == txnTs)
			y.AssertTrue(len(txn) > 0)
			// Got the end of txn. Now we can store them.
			for _, t := range txn {
				toLSM(t.nk, t.v)
			}
			txn = txn[:0]
			lastCommit = 0

		} else if e.meta&bitTxn > 0 {
			txnTs := y.ParseTs(nk)
			if lastCommit == 0 {
				lastCommit = txnTs
			}
			if lastCommit != txnTs {
				// A new commit ts appeared before the previous txn's fin
				// marker, so the previous txn never completed on disk.
				db.opt.Warningf("Found an incomplete txn at timestamp %d. Discarding it.\n",
					lastCommit)
				txn = txn[:0]
				lastCommit = txnTs
			}
			te := txnEntry{nk: nk, v: v}
			txn = append(txn, te)

		} else {
			// This entry is from a rewrite.
			toLSM(nk, v)

			// We shouldn't get this entry in the middle of a transaction.
			y.AssertTrue(lastCommit == 0)
			y.AssertTrue(len(txn) == 0)
		}
		return nil
	}
}
|
|
|
|
// Open returns a new DB object.
func Open(opt Options) (db *DB, err error) {
	// A single write batch must fit comfortably inside one memtable: cap it
	// at 15% of the table size, and derive the entry-count cap from the
	// worst-case skiplist node size.
	opt.maxBatchSize = (15 * opt.MaxTableSize) / 100
	opt.maxBatchCount = opt.maxBatchSize / int64(skl.MaxNodeSize)

	if opt.ValueThreshold > ValueThresholdLimit {
		return nil, ErrValueThreshold
	}

	if opt.ReadOnly {
		// Can't truncate if the DB is read only.
		opt.Truncate = false
		// Do not perform compaction in read only mode.
		opt.CompactL0OnClose = false
	}

	for _, path := range []string{opt.Dir, opt.ValueDir} {
		dirExists, err := exists(path)
		if err != nil {
			return nil, y.Wrapf(err, "Invalid Dir: %q", path)
		}
		if !dirExists {
			if opt.ReadOnly {
				return nil, errors.Errorf("Cannot find directory %q for read-only open", path)
			}
			// Try to create the directory
			err = os.Mkdir(path, 0700)
			if err != nil {
				return nil, y.Wrapf(err, "Error Creating Dir: %q", path)
			}
		}
	}
	var dirLockGuard, valueDirLockGuard *directoryLockGuard
	if !opt.BypassLockGuard {
		absDir, err := filepath.Abs(opt.Dir)
		if err != nil {
			return nil, err
		}
		absValueDir, err := filepath.Abs(opt.ValueDir)
		if err != nil {
			return nil, err
		}
		dirLockGuard, err = acquireDirectoryLock(opt.Dir, lockFile, opt.ReadOnly)
		if err != nil {
			return nil, err
		}
		// Released on any error path; nilled out on success just before
		// return, at which point db owns the guard.
		defer func() {
			if dirLockGuard != nil {
				_ = dirLockGuard.release()
			}
		}()
		if absValueDir != absDir {
			valueDirLockGuard, err = acquireDirectoryLock(opt.ValueDir, lockFile, opt.ReadOnly)
			if err != nil {
				return nil, err
			}
			defer func() {
				if valueDirLockGuard != nil {
					_ = valueDirLockGuard.release()
				}
			}()
		}
	}
	if !(opt.ValueLogFileSize <= 2<<30 && opt.ValueLogFileSize >= 1<<20) {
		return nil, ErrValueLogSize
	}
	if !(opt.ValueLogLoadingMode == options.FileIO ||
		opt.ValueLogLoadingMode == options.MemoryMap) {
		return nil, ErrInvalidLoadingMode
	}
	manifestFile, manifest, err := openOrCreateManifestFile(opt.Dir, opt.ReadOnly)
	if err != nil {
		return nil, err
	}
	// Same ownership dance as the lock guards above.
	defer func() {
		if manifestFile != nil {
			_ = manifestFile.close()
		}
	}()

	elog := y.NoEventLog
	if opt.EventLogging {
		elog = trace.NewEventLog("Badger", "DB")
	}

	db = &DB{
		imm:           make([]*skl.Skiplist, 0, opt.NumMemtables),
		flushChan:     make(chan flushTask, opt.NumMemtables),
		writeCh:       make(chan *request, kvWriteChCapacity),
		opt:           opt,
		manifest:      manifestFile,
		elog:          elog,
		dirLockGuard:  dirLockGuard,
		valueDirGuard: valueDirLockGuard,
		orc:           newOracle(opt),
		pub:           newPublisher(),
	}

	// Calculate initial size.
	db.calculateSize()
	db.closers.updateSize = y.NewCloser(1)
	go db.updateSize(db.closers.updateSize)
	db.mt = skl.NewSkiplist(arenaSize(opt))

	// newLevelsController potentially loads files in directory.
	if db.lc, err = newLevelsController(db, &manifest); err != nil {
		return nil, err
	}

	// Initialize vlog struct.
	db.vlog.init(db)

	if !opt.ReadOnly {
		db.closers.compactors = y.NewCloser(1)
		db.lc.startCompact(db.closers.compactors)

		db.closers.memtable = y.NewCloser(1)
		go func() {
			_ = db.flushMemtable(db.closers.memtable) // Need levels controller to be up.
		}()
	}

	headKey := y.KeyWithTs(head, math.MaxUint64)
	// Need to pass with timestamp, lsm get removes the last 8 bytes and compares key
	vs, err := db.get(headKey)
	if err != nil {
		return nil, errors.Wrap(err, "Retrieving head")
	}
	db.orc.nextTxnTs = vs.Version
	var vptr valuePointer
	if len(vs.Value) > 0 {
		vptr.Decode(vs.Value)
	}

	// Temporary writer goroutine used only while replaying the value log.
	replayCloser := y.NewCloser(1)
	go db.doWrites(replayCloser)

	if err = db.vlog.open(db, vptr, db.replayFunction()); err != nil {
		return db, err
	}
	replayCloser.SignalAndWait() // Wait for replay to be applied first.

	// Let's advance nextTxnTs to one more than whatever we observed via
	// replaying the logs.
	db.orc.txnMark.Done(db.orc.nextTxnTs)
	// In normal mode, we must update readMark so older versions of keys can be removed during
	// compaction when run in offline mode via the flatten tool.
	db.orc.readMark.Done(db.orc.nextTxnTs)
	db.orc.incrementNextTs()

	// Replace the replay-time write channel with a fresh one and start the
	// long-lived writer goroutine.
	db.writeCh = make(chan *request, kvWriteChCapacity)
	db.closers.writes = y.NewCloser(1)
	go db.doWrites(db.closers.writes)

	db.closers.valueGC = y.NewCloser(1)
	go db.vlog.waitOnGC(db.closers.valueGC)

	db.closers.pub = y.NewCloser(1)
	go db.pub.listenForUpdates(db.closers.pub)

	// Success: hand ownership of the guards and the manifest to db, turning
	// the deferred cleanups above into no-ops.
	valueDirLockGuard = nil
	dirLockGuard = nil
	manifestFile = nil
	return db, nil
}
|
|
|
|
// Close closes a DB. It's crucial to call it to ensure all the pending updates make their way to
|
|
// disk. Calling DB.Close() multiple times would still only close the DB once.
|
|
func (db *DB) Close() error {
|
|
var err error
|
|
db.closeOnce.Do(func() {
|
|
err = db.close()
|
|
})
|
|
return err
|
|
}
|
|
|
|
// close performs the actual shutdown: it stops background goroutines in
// dependency order (GC, writers, publisher), flushes the final memtable,
// optionally force-compacts L0, then releases file locks and syncs both
// directories. Only the first error encountered is returned.
func (db *DB) close() (err error) {
	db.elog.Printf("Closing database")

	// Reject any new writes from this point on.
	atomic.StoreInt32(&db.blockWrites, 1)

	// Stop value GC first.
	db.closers.valueGC.SignalAndWait()

	// Stop writes next.
	db.closers.writes.SignalAndWait()

	// Don't accept any more write.
	close(db.writeCh)

	db.closers.pub.SignalAndWait()

	// Now close the value log.
	if vlogErr := db.vlog.Close(); vlogErr != nil {
		err = errors.Wrap(vlogErr, "DB.Close")
	}

	// Make sure that block writer is done pushing stuff into memtable!
	// Otherwise, you will have a race condition: we are trying to flush memtables
	// and remove them completely, while the block / memtable writer is still
	// trying to push stuff into the memtable. This will also resolve the value
	// offset problem: as we push into memtable, we update value offsets there.
	if !db.mt.Empty() {
		db.elog.Printf("Flushing memtable")
		for {
			// Try to hand the final memtable to the flusher; retry until the
			// flush channel has capacity.
			pushedFlushTask := func() bool {
				db.Lock()
				defer db.Unlock()
				y.AssertTrue(db.mt != nil)
				select {
				case db.flushChan <- flushTask{mt: db.mt, vptr: db.vhead}:
					db.imm = append(db.imm, db.mt) // Flusher will attempt to remove this from s.imm.
					db.mt = nil                    // Will segfault if we try writing!
					db.elog.Printf("pushed to flush chan\n")
					return true
				default:
					// If we fail to push, we need to unlock and wait for a short while.
					// The flushing operation needs to update s.imm. Otherwise, we have a deadlock.
					// TODO: Think about how to do this more cleanly, maybe without any locks.
				}
				return false
			}()
			if pushedFlushTask {
				break
			}
			time.Sleep(10 * time.Millisecond)
		}
	}
	db.stopMemoryFlush()
	db.stopCompactions()

	// Force Compact L0
	// We don't need to care about cstatus since no parallel compaction is running.
	if db.opt.CompactL0OnClose {
		err := db.lc.doCompact(compactionPriority{level: 0, score: 1.73})
		switch err {
		case errFillTables:
			// This error only means that there might be enough tables to do a compaction. So, we
			// should not report it to the end user to avoid confusing them.
		case nil:
			db.opt.Infof("Force compaction on level 0 done")
		default:
			db.opt.Warningf("While forcing compaction on level 0: %v", err)
		}
	}

	// Each step below only records its error if none was seen earlier
	// (errors.Wrap(nil, ...) returns nil, so a clean step keeps err nil).
	if lcErr := db.lc.close(); err == nil {
		err = errors.Wrap(lcErr, "DB.Close")
	}
	db.elog.Printf("Waiting for closer")
	db.closers.updateSize.SignalAndWait()
	db.orc.Stop()

	db.elog.Finish()

	if db.dirLockGuard != nil {
		if guardErr := db.dirLockGuard.release(); err == nil {
			err = errors.Wrap(guardErr, "DB.Close")
		}
	}
	if db.valueDirGuard != nil {
		if guardErr := db.valueDirGuard.release(); err == nil {
			err = errors.Wrap(guardErr, "DB.Close")
		}
	}
	if manifestErr := db.manifest.close(); err == nil {
		err = errors.Wrap(manifestErr, "DB.Close")
	}

	// Fsync directories to ensure that lock file, and any other removed files whose directory
	// we haven't specifically fsynced, are guaranteed to have their directory entry removal
	// persisted to disk.
	if syncErr := syncDir(db.opt.Dir); err == nil {
		err = errors.Wrap(syncErr, "DB.Close")
	}
	if syncErr := syncDir(db.opt.ValueDir); err == nil {
		err = errors.Wrap(syncErr, "DB.Close")
	}

	return err
}
|
|
|
|
const (
	// lockFile is the name of the file used to guard a badger directory
	// against concurrent use by multiple processes.
	lockFile = "LOCK"
)
|
|
|
|
// Sync syncs database content to disk. This function provides
// more control to user to sync data whenever required.
func (db *DB) Sync() error {
	// math.MaxUint32 asks the value log to sync up to its latest file.
	return db.vlog.sync(math.MaxUint32)
}
|
|
|
|
// getMemtables returns the current memtables and get references.
|
|
func (db *DB) getMemTables() ([]*skl.Skiplist, func()) {
|
|
db.RLock()
|
|
defer db.RUnlock()
|
|
|
|
tables := make([]*skl.Skiplist, len(db.imm)+1)
|
|
|
|
// Get mutable memtable.
|
|
tables[0] = db.mt
|
|
tables[0].IncrRef()
|
|
|
|
// Get immutable memtables.
|
|
last := len(db.imm) - 1
|
|
for i := range db.imm {
|
|
tables[i+1] = db.imm[last-i]
|
|
tables[i+1].IncrRef()
|
|
}
|
|
return tables, func() {
|
|
for _, tbl := range tables {
|
|
tbl.DecrRef()
|
|
}
|
|
}
|
|
}
|
|
|
|
// get returns the value in memtable or disk for given key.
// Note that value will include meta byte.
//
// IMPORTANT: We should never write an entry with an older timestamp for the same key, We need to
// maintain this invariant to search for the latest value of a key, or else we need to search in all
// tables and find the max version among them. To maintain this invariant, we also need to ensure
// that all versions of a key are always present in the same table from level 1, because compaction
// can push any table down.
//
// Update (Sep 22, 2018): To maintain the above invariant, and to allow keys to be moved from one
// value log to another (while reclaiming space during value log GC), we have logically moved this
// need to write "old versions after new versions" to the badgerMove keyspace. Thus, for normal
// gets, we can stop going down the LSM tree once we find any version of the key (note however that
// we will ALWAYS skip versions with ts greater than the key version). However, if that key has
// been moved, then for the corresponding movekey, we'll look through all the levels of the tree
// to ensure that we pick the highest version of the movekey present.
func (db *DB) get(key []byte) (y.ValueStruct, error) {
	tables, decr := db.getMemTables() // Lock should be released.
	defer decr()

	// maxVs stays nil for normal keys (first hit wins). For move keys it
	// accumulates the highest version seen across all memtables.
	var maxVs *y.ValueStruct
	var version uint64
	if bytes.HasPrefix(key, badgerMove) {
		// If we are checking badgerMove key, we should look into all the
		// levels, so we can pick up the newer versions, which might have been
		// compacted down the tree.
		maxVs = &y.ValueStruct{}
		version = y.ParseTs(key)
	}

	y.NumGets.Add(1)
	for i := 0; i < len(tables); i++ {
		vs := tables[i].Get(key)
		y.NumMemtableGets.Add(1)
		if vs.Meta == 0 && vs.Value == nil {
			// Not present in this memtable.
			continue
		}
		// Found a version of the key. For user keyspace, return immediately. For move keyspace,
		// continue iterating, unless we found a version == given key version.
		if maxVs == nil || vs.Version == version {
			return vs, nil
		}
		if maxVs.Version < vs.Version {
			*maxVs = vs
		}
	}
	// Fall through to the on-disk LSM; for move keys, maxVs carries the best
	// memtable candidate so far.
	return db.lc.get(key, maxVs)
}
|
|
|
|
// updateHead should not be called without the db.Lock() since db.vhead is used
|
|
// by the writer go routines and memtable flushing goroutine.
|
|
func (db *DB) updateHead(ptrs []valuePointer) {
|
|
var ptr valuePointer
|
|
for i := len(ptrs) - 1; i >= 0; i-- {
|
|
p := ptrs[i]
|
|
if !p.IsZero() {
|
|
ptr = p
|
|
break
|
|
}
|
|
}
|
|
if ptr.IsZero() {
|
|
return
|
|
}
|
|
|
|
y.AssertTrue(!ptr.Less(db.vhead))
|
|
db.vhead = ptr
|
|
}
|
|
|
|
// requestPool recycles request structs to cut allocations on the hot write
// path; see sendToWriteCh.
var requestPool = sync.Pool{
	New: func() interface{} {
		return new(request)
	},
}
|
|
|
|
// shouldWriteValueToLSM reports whether the entry's value is small enough
// (below ValueThreshold) to be stored inline in the LSM tree rather than as a
// pointer into the value log.
func (db *DB) shouldWriteValueToLSM(e Entry) bool {
	return len(e.Value) < db.opt.ValueThreshold
}
|
|
|
|
// writeToLSM applies every entry of the request to the current memtable.
// Small values are stored inline; larger ones are stored as encoded pointers
// into the value log (b.Ptrs, filled in earlier by vlog.write).
func (db *DB) writeToLSM(b *request) error {
	// vlog.write must have run first, producing one pointer per entry.
	if len(b.Ptrs) != len(b.Entries) {
		return errors.Errorf("Ptrs and Entries don't match: %+v", b)
	}

	for i, entry := range b.Entries {
		if entry.meta&bitFinTxn != 0 {
			// Txn-finish markers live only in the value log, not the LSM.
			continue
		}
		if db.shouldWriteValueToLSM(*entry) { // Will include deletion / tombstone case.
			db.mt.Put(entry.Key,
				y.ValueStruct{
					Value: entry.Value,
					// Ensure value pointer flag is removed. Otherwise, the value will fail
					// to be retrieved during iterator prefetch. `bitValuePointer` is only
					// known to be set in write to LSM when the entry is loaded from a backup
					// with lower ValueThreshold and its value was stored in the value log.
					Meta:      entry.meta &^ bitValuePointer,
					UserMeta:  entry.UserMeta,
					ExpiresAt: entry.ExpiresAt,
				})
		} else {
			// Value lives in the vlog: store the encoded pointer and flag it.
			var offsetBuf [vptrSize]byte
			db.mt.Put(entry.Key,
				y.ValueStruct{
					Value:     b.Ptrs[i].Encode(offsetBuf[:]),
					Meta:      entry.meta | bitValuePointer,
					UserMeta:  entry.UserMeta,
					ExpiresAt: entry.ExpiresAt,
				})
		}
	}
	return nil
}
|
|
|
|
// writeRequests is called serially by only one goroutine. It writes reqs to
// the value log first, then applies each request to the memtable, and finally
// marks every request done (propagating the shared error, if any).
func (db *DB) writeRequests(reqs []*request) error {
	if len(reqs) == 0 {
		return nil
	}

	// done propagates err to every request and releases their waiters.
	done := func(err error) {
		for _, r := range reqs {
			r.Err = err
			r.Wg.Done()
		}
	}
	db.elog.Printf("writeRequests called. Writing to value log")

	err := db.vlog.write(reqs)
	if err != nil {
		done(err)
		return err
	}

	db.elog.Printf("Sending updates to subscribers")
	db.pub.sendUpdates(reqs)
	db.elog.Printf("Writing to memtable")
	var count int
	for _, b := range reqs {
		if len(b.Entries) == 0 {
			continue
		}
		count += len(b.Entries)
		var i uint64
		for err = db.ensureRoomForWrite(); err == errNoRoom; err = db.ensureRoomForWrite() {
			i++
			if i%100 == 0 {
				db.elog.Printf("Making room for writes")
			}
			// We need to poll a bit because both hasRoomForWrite and the flusher need access to s.imm.
			// When flushChan is full and you are blocked there, and the flusher is trying to update s.imm,
			// you will get a deadlock.
			time.Sleep(10 * time.Millisecond)
		}
		if err != nil {
			done(err)
			return errors.Wrap(err, "writeRequests")
		}
		if err := db.writeToLSM(b); err != nil {
			done(err)
			return errors.Wrap(err, "writeRequests")
		}
		// vhead is shared with the flusher, so guard the update.
		db.Lock()
		db.updateHead(b.Ptrs)
		db.Unlock()
	}
	done(nil)
	db.elog.Printf("%d entries written", count)
	return nil
}
|
|
|
|
func (db *DB) sendToWriteCh(entries []*Entry) (*request, error) {
|
|
if atomic.LoadInt32(&db.blockWrites) == 1 {
|
|
return nil, ErrBlockedWrites
|
|
}
|
|
var count, size int64
|
|
for _, e := range entries {
|
|
size += int64(e.estimateSize(db.opt.ValueThreshold))
|
|
count++
|
|
}
|
|
if count >= db.opt.maxBatchCount || size >= db.opt.maxBatchSize {
|
|
return nil, ErrTxnTooBig
|
|
}
|
|
|
|
// We can only service one request because we need each txn to be stored in a contigous section.
|
|
// Txns should not interleave among other txns or rewrites.
|
|
req := requestPool.Get().(*request)
|
|
req.reset()
|
|
req.Entries = entries
|
|
req.Wg.Add(1)
|
|
req.IncrRef() // for db write
|
|
db.writeCh <- req // Handled in doWrites.
|
|
y.NumPuts.Add(int64(len(entries)))
|
|
|
|
return req, nil
|
|
}
|
|
|
|
// doWrites batches incoming requests from db.writeCh and hands each batch to
// writeRequests. pendingCh acts as a single-slot semaphore so at most one
// batch is in flight; a batch is flushed either when the semaphore is free or
// when it grows to 3*kvWriteChCapacity.
func (db *DB) doWrites(lc *y.Closer) {
	defer lc.Done()
	pendingCh := make(chan struct{}, 1)

	// writeRequests wraps db.writeRequests and frees the semaphore slot when
	// the batch has been fully applied.
	writeRequests := func(reqs []*request) {
		if err := db.writeRequests(reqs); err != nil {
			db.opt.Errorf("writeRequests: %v", err)
		}
		<-pendingCh
	}

	// This variable tracks the number of pending writes.
	reqLen := new(expvar.Int)
	y.PendingWrites.Set(db.opt.Dir, reqLen)

	reqs := make([]*request, 0, 10)
	for {
		var r *request
		select {
		case r = <-db.writeCh:
		case <-lc.HasBeenClosed():
			goto closedCase
		}

		for {
			reqs = append(reqs, r)
			reqLen.Set(int64(len(reqs)))

			if len(reqs) >= 3*kvWriteChCapacity {
				pendingCh <- struct{}{} // blocking.
				goto writeCase
			}

			select {
			// Either push to pending, or continue to pick from writeCh.
			case r = <-db.writeCh:
			case pendingCh <- struct{}{}:
				goto writeCase
			case <-lc.HasBeenClosed():
				goto closedCase
			}
		}

	closedCase:
		// All the pending request are drained.
		// Don't close the writeCh, because it has be used in several places.
		for {
			select {
			case r = <-db.writeCh:
				reqs = append(reqs, r)
			default:
				pendingCh <- struct{}{} // Push to pending before doing a write.
				writeRequests(reqs)
				return
			}
		}

	writeCase:
		// Flush the batch asynchronously and start collecting the next one.
		go writeRequests(reqs)
		reqs = make([]*request, 0, 10)
		reqLen.Set(0)
	}
}
|
|
|
|
// batchSet applies a list of badger.Entry. If a request level error occurs it
|
|
// will be returned.
|
|
// Check(kv.BatchSet(entries))
|
|
func (db *DB) batchSet(entries []*Entry) error {
|
|
req, err := db.sendToWriteCh(entries)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
return req.Wait()
|
|
}
|
|
|
|
// batchSetAsync is the asynchronous version of batchSet. It accepts a callback
|
|
// function which is called when all the sets are complete. If a request level
|
|
// error occurs, it will be passed back via the callback.
|
|
// err := kv.BatchSetAsync(entries, func(err error)) {
|
|
// Check(err)
|
|
// }
|
|
func (db *DB) batchSetAsync(entries []*Entry, f func(error)) error {
|
|
req, err := db.sendToWriteCh(entries)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
go func() {
|
|
err := req.Wait()
|
|
// Write is complete. Let's call the callback function now.
|
|
f(err)
|
|
}()
|
|
return nil
|
|
}
|
|
|
|
// errNoRoom signals that the memtable is full and the flush channel is busy;
// writers poll and retry on it (see writeRequests).
var errNoRoom = errors.New("No room for write")
|
|
|
|
// ensureRoomForWrite is always called serially. It rotates the memtable into
// db.imm (handing it to the flusher) when it is full or when enough vlog
// rotations have accumulated. Returns errNoRoom when the flush channel is
// busy so the caller can back off without deadlocking the flusher.
func (db *DB) ensureRoomForWrite() error {
	var err error
	db.Lock()
	defer db.Unlock()

	// Here we determine if we need to force flush memtable. Given we rotated log file, it would
	// make sense to force flush a memtable, so the updated value head would have a chance to be
	// pushed to L0. Otherwise, it would not go to L0, until the memtable has been fully filled,
	// which can take a lot longer if the write load has fewer keys and larger values. This force
	// flush, thus avoids the need to read through a lot of log files on a crash and restart.
	// Above approach is quite simple with small drawback. We are calling ensureRoomForWrite before
	// inserting every entry in Memtable. We will get latest db.head after all entries for a request
	// are inserted in Memtable. If we have done >= db.logRotates rotations, then while inserting
	// first entry in Memtable, below condition will be true and we will endup flushing old value of
	// db.head. Hence we are limiting no of value log files to be read to db.logRotates only.
	forceFlush := atomic.LoadInt32(&db.logRotates) >= db.opt.LogRotatesToFlush

	if !forceFlush && db.mt.MemSize() < db.opt.MaxTableSize {
		return nil
	}

	y.AssertTrue(db.mt != nil) // A nil mt indicates that DB is being closed.
	select {
	case db.flushChan <- flushTask{mt: db.mt, vptr: db.vhead}:
		// After every memtable flush, let's reset the counter.
		atomic.StoreInt32(&db.logRotates, 0)

		// Ensure value log is synced to disk so this memtable's contents wouldn't be lost.
		err = db.vlog.sync(db.vhead.Fid)
		if err != nil {
			return err
		}

		db.opt.Debugf("Flushing memtable, mt.size=%d size of flushChan: %d\n",
			db.mt.MemSize(), len(db.flushChan))
		// We manage to push this task. Let's modify imm.
		db.imm = append(db.imm, db.mt)
		db.mt = skl.NewSkiplist(arenaSize(db.opt))
		// New memtable is empty. We certainly have room.
		return nil
	default:
		// We need to do this to unlock and allow the flusher to modify imm.
		return errNoRoom
	}
}
|
|
|
|
// arenaSize returns the skiplist arena size: the max table size plus enough
// slack to absorb one maximum-sized write batch (see Open for how
// maxBatchSize and maxBatchCount are derived).
func arenaSize(opt Options) int64 {
	return opt.MaxTableSize + opt.maxBatchSize + opt.maxBatchCount*int64(skl.MaxNodeSize)
}
|
|
|
|
// WriteLevel0Table flushes memtable.
|
|
func writeLevel0Table(ft flushTask, f io.Writer) error {
|
|
iter := ft.mt.NewIterator()
|
|
defer iter.Close()
|
|
b := table.NewTableBuilder()
|
|
defer b.Close()
|
|
for iter.SeekToFirst(); iter.Valid(); iter.Next() {
|
|
if len(ft.dropPrefixes) > 0 && hasAnyPrefixes(iter.Key(), ft.dropPrefixes) {
|
|
continue
|
|
}
|
|
b.Add(iter.Key(), iter.Value())
|
|
}
|
|
_, err := f.Write(b.Finish())
|
|
return err
|
|
}
|
|
|
|
// flushTask describes one memtable handed to the flush goroutine.
type flushTask struct {
	mt           *skl.Skiplist // Memtable to flush; a nil mt is skipped by flushMemtable.
	vptr         valuePointer  // Value-log head at the time the memtable was rotated.
	dropPrefixes [][]byte      // Keys with any of these prefixes are skipped when writing L0.
}
|
|
|
|
// pushHead records the value-log head pointer inside the memtable (under the
// internal !badger!head key) so that after a crash, replay can resume from
// this offset instead of from file zero. Must run before the memtable is
// written out.
func (db *DB) pushHead(ft flushTask) error {
	// Ensure we never push a zero valued head pointer.
	if ft.vptr.IsZero() {
		return errors.New("Head should not be zero")
	}

	db.opt.Debugf("Storing value log head: %+v\n", ft.vptr)
	offset := make([]byte, vptrSize)
	ft.vptr.Encode(offset)

	// Pick the max commit ts, so in case of crash, our read ts would be higher than all the
	// commits.
	headTs := y.KeyWithTs(head, db.orc.nextTs())
	ft.mt.Put(headTs, y.ValueStruct{Value: offset})

	return nil
}
|
|
|
|
// handleFlushTask must be run serially. It records the vlog head in the
// memtable, writes the memtable out as a new L0 SSTable, and registers the
// table with the levels controller.
func (db *DB) handleFlushTask(ft flushTask) error {
	// There can be a scenario, when empty memtable is flushed. For example, memtable is empty and
	// after writing request to value log, rotation count exceeds db.LogRotatesToFlush.
	if ft.mt.Empty() {
		return nil
	}

	if err := db.pushHead(ft); err != nil {
		return err
	}

	fileID := db.lc.reserveFileID()
	fd, err := y.CreateSyncedFile(table.NewFilename(fileID, db.opt.Dir), true)
	if err != nil {
		return y.Wrap(err)
	}

	// Don't block just to sync the directory entry.
	dirSyncCh := make(chan error)
	go func() { dirSyncCh <- syncDir(db.opt.Dir) }()

	err = writeLevel0Table(ft, fd)
	dirSyncErr := <-dirSyncCh

	if err != nil {
		db.elog.Errorf("ERROR while writing to level 0: %v", err)
		return err
	}
	if dirSyncErr != nil {
		// Do dir sync as best effort. No need to return due to an error there.
		db.elog.Errorf("ERROR while syncing level directory: %v", dirSyncErr)
	}

	tbl, err := table.OpenTable(fd, db.opt.TableLoadingMode, nil)
	if err != nil {
		db.elog.Printf("ERROR while opening table: %v", err)
		return err
	}
	// We own a ref on tbl.
	err = db.lc.addLevel0Table(tbl) // This will incrRef (if we don't error, sure)
	_ = tbl.DecrRef()               // Releases our ref.
	return err
}
|
|
|
|
// flushMemtable must keep running until we send it an empty flushTask. If there
// are errors during handling the flush task, we'll retry indefinitely.
func (db *DB) flushMemtable(lc *y.Closer) error {
	defer lc.Done()

	for ft := range db.flushChan {
		if ft.mt == nil {
			// We close db.flushChan now, instead of sending a nil ft.mt.
			continue
		}
		for {
			err := db.handleFlushTask(ft)
			if err == nil {
				// Update s.imm. Need a lock.
				db.Lock()
				// This is a single-threaded operation. ft.mt corresponds to the head of
				// db.imm list. Once we flush it, we advance db.imm. The next ft.mt
				// which would arrive here would match db.imm[0], because we acquire a
				// lock over DB when pushing to flushChan.
				// TODO: This logic is dirty AF. Any change and this could easily break.
				y.AssertTrue(ft.mt == db.imm[0])
				db.imm = db.imm[1:]
				ft.mt.DecrRef() // Return memory.
				db.Unlock()

				break
			}
			// Encountered error. Retry indefinitely.
			db.opt.Errorf("Failure while flushing memtable to disk: %v. Retrying...\n", err)
			time.Sleep(time.Second)
		}
	}
	return nil
}
|
|
|
|
// exists reports whether the file or directory at path exists. A stat failure
// other than "does not exist" is reported as (true, err), so callers treat an
// unreadable path as present.
func exists(path string) (bool, error) {
	switch _, err := os.Stat(path); {
	case err == nil:
		return true, nil
	case os.IsNotExist(err):
		return false, nil
	default:
		return true, err
	}
}
|
|
|
|
// calculateSize does a filewalk, calculates the size of vlog and sst files and stores it in
// y.LSMSize and y.VlogSize.
func (db *DB) calculateSize() {
	newInt := func(val int64) *expvar.Int {
		v := new(expvar.Int)
		v.Add(val)
		return v
	}

	// totalSize walks dir once, summing .sst (LSM) and .vlog sizes separately.
	totalSize := func(dir string) (int64, int64) {
		var lsmSize, vlogSize int64
		err := filepath.Walk(dir, func(path string, info os.FileInfo, err error) error {
			if err != nil {
				return err
			}
			ext := filepath.Ext(path)
			if ext == ".sst" {
				lsmSize += info.Size()
			} else if ext == ".vlog" {
				vlogSize += info.Size()
			}
			return nil
		})
		// Best effort: on walk failure we just log and report the partial sums.
		if err != nil {
			db.elog.Printf("Got error while calculating total size of directory: %s", dir)
		}
		return lsmSize, vlogSize
	}

	lsmSize, vlogSize := totalSize(db.opt.Dir)
	y.LSMSize.Set(db.opt.Dir, newInt(lsmSize))
	// If valueDir is different from dir, we'd have to do another walk.
	if db.opt.ValueDir != db.opt.Dir {
		_, vlogSize = totalSize(db.opt.ValueDir)
	}
	y.VlogSize.Set(db.opt.ValueDir, newInt(vlogSize))
}
|
|
|
|
// updateSize refreshes the LSM/vlog size metrics once a minute until lc is
// signalled closed.
func (db *DB) updateSize(lc *y.Closer) {
	defer lc.Done()

	metricsTicker := time.NewTicker(time.Minute)
	defer metricsTicker.Stop()

	for {
		select {
		case <-metricsTicker.C:
			db.calculateSize()
		case <-lc.HasBeenClosed():
			return
		}
	}
}
|
|
|
|
// RunValueLogGC triggers a value log garbage collection.
|
|
//
|
|
// It picks value log files to perform GC based on statistics that are collected
|
|
// during compactions. If no such statistics are available, then log files are
|
|
// picked in random order. The process stops as soon as the first log file is
|
|
// encountered which does not result in garbage collection.
|
|
//
|
|
// When a log file is picked, it is first sampled. If the sample shows that we
|
|
// can discard at least discardRatio space of that file, it would be rewritten.
|
|
//
|
|
// If a call to RunValueLogGC results in no rewrites, then an ErrNoRewrite is
|
|
// thrown indicating that the call resulted in no file rewrites.
|
|
//
|
|
// We recommend setting discardRatio to 0.5, thus indicating that a file be
|
|
// rewritten if half the space can be discarded. This results in a lifetime
|
|
// value log write amplification of 2 (1 from original write + 0.5 rewrite +
|
|
// 0.25 + 0.125 + ... = 2). Setting it to higher value would result in fewer
|
|
// space reclaims, while setting it to a lower value would result in more space
|
|
// reclaims at the cost of increased activity on the LSM tree. discardRatio
|
|
// must be in the range (0.0, 1.0), both endpoints excluded, otherwise an
|
|
// ErrInvalidRequest is returned.
|
|
//
|
|
// Only one GC is allowed at a time. If another value log GC is running, or DB
|
|
// has been closed, this would return an ErrRejected.
|
|
//
|
|
// Note: Every time GC is run, it would produce a spike of activity on the LSM
|
|
// tree.
|
|
func (db *DB) RunValueLogGC(discardRatio float64) error {
|
|
if discardRatio >= 1.0 || discardRatio <= 0.0 {
|
|
return ErrInvalidRequest
|
|
}
|
|
|
|
// Find head on disk
|
|
headKey := y.KeyWithTs(head, math.MaxUint64)
|
|
// Need to pass with timestamp, lsm get removes the last 8 bytes and compares key
|
|
val, err := db.lc.get(headKey, nil)
|
|
if err != nil {
|
|
return errors.Wrap(err, "Retrieving head from on-disk LSM")
|
|
}
|
|
|
|
var head valuePointer
|
|
if len(val.Value) > 0 {
|
|
head.Decode(val.Value)
|
|
}
|
|
|
|
// Pick a log file and run GC
|
|
return db.vlog.runGC(discardRatio, head)
|
|
}
|
|
|
|
// Size returns the size of lsm and value log files in bytes. It can be used to decide how often to
|
|
// call RunValueLogGC.
|
|
func (db *DB) Size() (lsm, vlog int64) {
|
|
if y.LSMSize.Get(db.opt.Dir) == nil {
|
|
lsm, vlog = 0, 0
|
|
return
|
|
}
|
|
lsm = y.LSMSize.Get(db.opt.Dir).(*expvar.Int).Value()
|
|
vlog = y.VlogSize.Get(db.opt.ValueDir).(*expvar.Int).Value()
|
|
return
|
|
}
|
|
|
|
// Sequence represents a Badger sequence.
//
// It hands out monotonically increasing uint64 values via Next(), leasing
// them from the DB in batches of `bandwidth` so most calls are served from
// memory without a transaction.
type Sequence struct {
	sync.Mutex // Guards the fields below.
	db *DB
	key []byte // Key under which the current lease is persisted.
	next uint64 // Next value to hand out.
	leased uint64 // Exclusive upper bound of the current lease.
	bandwidth uint64 // How many values each lease reserves.
}
|
|
|
|
// Next would return the next integer in the sequence, updating the lease by running a transaction
|
|
// if needed.
|
|
func (seq *Sequence) Next() (uint64, error) {
|
|
seq.Lock()
|
|
defer seq.Unlock()
|
|
if seq.next >= seq.leased {
|
|
if err := seq.updateLease(); err != nil {
|
|
return 0, err
|
|
}
|
|
}
|
|
val := seq.next
|
|
seq.next++
|
|
return val, nil
|
|
}
|
|
|
|
// Release the leased sequence to avoid wasted integers. This should be done right
|
|
// before closing the associated DB. However it is valid to use the sequence after
|
|
// it was released, causing a new lease with full bandwidth.
|
|
func (seq *Sequence) Release() error {
|
|
seq.Lock()
|
|
defer seq.Unlock()
|
|
err := seq.db.Update(func(txn *Txn) error {
|
|
item, err := txn.Get(seq.key)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
var num uint64
|
|
if err := item.Value(func(v []byte) error {
|
|
num = binary.BigEndian.Uint64(v)
|
|
return nil
|
|
}); err != nil {
|
|
return err
|
|
}
|
|
|
|
if num == seq.leased {
|
|
var buf [8]byte
|
|
binary.BigEndian.PutUint64(buf[:], seq.next)
|
|
return txn.SetEntry(NewEntry(seq.key, buf[:]))
|
|
}
|
|
|
|
return nil
|
|
})
|
|
if err != nil {
|
|
return err
|
|
}
|
|
seq.leased = seq.next
|
|
return nil
|
|
}
|
|
|
|
func (seq *Sequence) updateLease() error {
|
|
return seq.db.Update(func(txn *Txn) error {
|
|
item, err := txn.Get(seq.key)
|
|
if err == ErrKeyNotFound {
|
|
seq.next = 0
|
|
} else if err != nil {
|
|
return err
|
|
} else {
|
|
var num uint64
|
|
if err := item.Value(func(v []byte) error {
|
|
num = binary.BigEndian.Uint64(v)
|
|
return nil
|
|
}); err != nil {
|
|
return err
|
|
}
|
|
seq.next = num
|
|
}
|
|
|
|
lease := seq.next + seq.bandwidth
|
|
var buf [8]byte
|
|
binary.BigEndian.PutUint64(buf[:], lease)
|
|
if err = txn.SetEntry(NewEntry(seq.key, buf[:])); err != nil {
|
|
return err
|
|
}
|
|
seq.leased = lease
|
|
return nil
|
|
})
|
|
}
|
|
|
|
// GetSequence would initiate a new sequence object, generating it from the stored lease, if
|
|
// available, in the database. Sequence can be used to get a list of monotonically increasing
|
|
// integers. Multiple sequences can be created by providing different keys. Bandwidth sets the
|
|
// size of the lease, determining how many Next() requests can be served from memory.
|
|
//
|
|
// GetSequence is not supported on ManagedDB. Calling this would result in a panic.
|
|
func (db *DB) GetSequence(key []byte, bandwidth uint64) (*Sequence, error) {
|
|
if db.opt.managedTxns {
|
|
panic("Cannot use GetSequence with managedDB=true.")
|
|
}
|
|
|
|
switch {
|
|
case len(key) == 0:
|
|
return nil, ErrEmptyKey
|
|
case bandwidth == 0:
|
|
return nil, ErrZeroBandwidth
|
|
}
|
|
seq := &Sequence{
|
|
db: db,
|
|
key: key,
|
|
next: 0,
|
|
leased: 0,
|
|
bandwidth: bandwidth,
|
|
}
|
|
err := seq.updateLease()
|
|
return seq, err
|
|
}
|
|
|
|
// Tables gets the TableInfo objects from the level controller. If withKeysCount
// is true, TableInfo objects also contain counts of keys for the tables.
func (db *DB) Tables(withKeysCount bool) []TableInfo {
	// Delegate to the levels controller, which owns all table metadata.
	return db.lc.getTableInfo(withKeysCount)
}
|
|
|
|
// KeySplits can be used to get rough key ranges to divide up iteration over
|
|
// the DB.
|
|
func (db *DB) KeySplits(prefix []byte) []string {
|
|
var splits []string
|
|
// We just want table ranges here and not keys count.
|
|
for _, ti := range db.Tables(false) {
|
|
// We don't use ti.Left, because that has a tendency to store !badger
|
|
// keys.
|
|
if bytes.HasPrefix(ti.Right, prefix) {
|
|
splits = append(splits, string(ti.Right))
|
|
}
|
|
}
|
|
sort.Strings(splits)
|
|
return splits
|
|
}
|
|
|
|
// MaxBatchCount returns max possible entries in batch
// (derived from the DB options at open time).
func (db *DB) MaxBatchCount() int64 {
	return db.opt.maxBatchCount
}
|
|
|
|
// MaxBatchSize returns max possible batch size
// (derived from the DB options at open time).
func (db *DB) MaxBatchSize() int64 {
	return db.opt.maxBatchSize
}
|
|
|
|
// stopMemoryFlush closes the flush channel and waits for the memtable
// flusher goroutine to exit. No-op when the flusher was never started
// (closers.memtable is nil).
func (db *DB) stopMemoryFlush() {
	// Stop memtable flushes.
	if db.closers.memtable != nil {
		close(db.flushChan)
		db.closers.memtable.SignalAndWait()
	}
}
|
|
|
|
// stopCompactions signals the compaction goroutines to stop and blocks until
// they have exited. No-op when compactions were never started.
func (db *DB) stopCompactions() {
	// Stop compactions.
	if db.closers.compactors != nil {
		db.closers.compactors.SignalAndWait()
	}
}
|
|
|
|
// startCompactions restarts the compactors previously halted by
// stopCompactions. The nil check mirrors stopCompactions: a non-nil
// closers.compactors means compactions were enabled for this DB, so a fresh
// closer is installed and the compactor goroutine restarted.
func (db *DB) startCompactions() {
	// Resume compactions.
	if db.closers.compactors != nil {
		db.closers.compactors = y.NewCloser(1)
		db.lc.startCompact(db.closers.compactors)
	}
}
|
|
|
|
// startMemoryFlush recreates the flush channel and restarts the memtable
// flusher goroutine previously stopped by stopMemoryFlush.
func (db *DB) startMemoryFlush() {
	// Start memory flusher.
	if db.closers.memtable != nil {
		db.flushChan = make(chan flushTask, db.opt.NumMemtables)
		db.closers.memtable = y.NewCloser(1)
		go func() {
			// Error deliberately discarded here; best-effort restart.
			_ = db.flushMemtable(db.closers.memtable)
		}()
	}
}
|
|
|
|
// Flatten can be used to force compactions on the LSM tree so all the tables fall on the same
// level. This ensures that all the versions of keys are colocated and not split across multiple
// levels, which is necessary after a restore from backup. During Flatten, live compactions are
// stopped. Ideally, no writes are going on during Flatten. Otherwise, it would create competition
// between flattening the tree and new tables being created at level zero.
func (db *DB) Flatten(workers int) error {
	// Pause background compactions so they don't race with the manual ones below.
	db.stopCompactions()
	defer db.startCompactions()

	// compactAway runs `workers` concurrent compactions of the same priority and
	// reports success if at least one of them succeeded.
	compactAway := func(cp compactionPriority) error {
		db.opt.Infof("Attempting to compact with %+v\n", cp)
		errCh := make(chan error, 1)
		for i := 0; i < workers; i++ {
			go func() {
				errCh <- db.lc.doCompact(cp)
			}()
		}
		var success int
		var rerr error
		// Collect one result per worker; remember the last error seen.
		for i := 0; i < workers; i++ {
			err := <-errCh
			if err != nil {
				rerr = err
				db.opt.Warningf("While running doCompact with %+v. Error: %v\n", cp, err)
			} else {
				success++
			}
		}
		if success == 0 {
			return rerr
		}
		// We could do at least one successful compaction. So, we'll consider this a success.
		db.opt.Infof("%d compactor(s) succeeded. One or more tables from level %d compacted.\n",
			success, cp.level)
		return nil
	}

	// hbytes renders a byte count in human-readable form for the log lines below.
	hbytes := func(sz int64) string {
		return humanize.Bytes(uint64(sz))
	}

	// Keep compacting until at most one level holds data and the level picker
	// reports nothing left worth compacting.
	for {
		db.opt.Infof("\n")
		var levels []int
		for i, l := range db.lc.levels {
			sz := l.getTotalSize()
			db.opt.Infof("Level: %d. %8s Size. %8s Max.\n",
				i, hbytes(l.getTotalSize()), hbytes(l.maxTotalSize))
			if sz > 0 {
				levels = append(levels, i)
			}
		}
		if len(levels) <= 1 {
			prios := db.lc.pickCompactLevels()
			if len(prios) == 0 || prios[0].score <= 1.0 {
				db.opt.Infof("All tables consolidated into one level. Flattening done.\n")
				return nil
			}
			if err := compactAway(prios[0]); err != nil {
				return err
			}
			continue
		}
		// Create an artificial compaction priority, to ensure that we compact the level.
		cp := compactionPriority{level: levels[0], score: 1.71}
		if err := compactAway(cp); err != nil {
			return err
		}
	}
}
|
|
|
|
// blockWrite stops acceptance of new writes, then waits for all pending
// write requests to finish. Reversed by unblockWrite.
func (db *DB) blockWrite() {
	// Stop accepting new writes.
	atomic.StoreInt32(&db.blockWrites, 1)

	// Make all pending writes finish. The following will also close writeCh.
	db.closers.writes.SignalAndWait()
	db.opt.Infof("Writes flushed. Stopping compactions now...")
}
|
|
|
|
// unblockWrite restarts the write-handling goroutine and then re-enables
// acceptance of new writes (in that order, so the handler exists before
// writers are let back in).
func (db *DB) unblockWrite() {
	db.closers.writes = y.NewCloser(1)
	go db.doWrites(db.closers.writes)

	// Resume writes.
	atomic.StoreInt32(&db.blockWrites, 0)
}
|
|
|
|
// prepareToDrop blocks incoming writes, drains any write requests already
// queued on writeCh, and stops memtable flushes so a drop operation sees a
// quiesced DB. It returns a function that undoes all of this (restarts the
// flusher and unblocks writes). Panics in read-only mode.
func (db *DB) prepareToDrop() func() {
	if db.opt.ReadOnly {
		panic("Attempting to drop data in read-only mode.")
	}
	// In order prepare for drop, we need to block the incoming writes and
	// write it to db. Then, flush all the pending flushtask. So that, we
	// don't miss any entries.
	db.blockWrite()
	reqs := make([]*request, 0, 10)
	for {
		select {
		case r := <-db.writeCh:
			reqs = append(reqs, r)
		default:
			// writeCh is drained: persist what we collected, stop flushes,
			// and hand back the resume function.
			if err := db.writeRequests(reqs); err != nil {
				db.opt.Errorf("writeRequests: %v", err)
			}
			db.stopMemoryFlush()
			return func() {
				db.opt.Infof("Resuming writes")
				db.startMemoryFlush()
				db.unblockWrite()
			}
		}
	}
}
|
|
|
|
// DropAll would drop all the data stored in Badger. It does this in the following way.
|
|
// - Stop accepting new writes.
|
|
// - Pause memtable flushes and compactions.
|
|
// - Pick all tables from all levels, create a changeset to delete all these
|
|
// tables and apply it to manifest.
|
|
// - Pick all log files from value log, and delete all of them. Restart value log files from zero.
|
|
// - Resume memtable flushes and compactions.
|
|
//
|
|
// NOTE: DropAll is resilient to concurrent writes, but not to reads. It is up to the user to not do
|
|
// any reads while DropAll is going on, otherwise they may result in panics. Ideally, both reads and
|
|
// writes are paused before running DropAll, and resumed after it is finished.
|
|
func (db *DB) DropAll() error {
|
|
f, err := db.dropAll()
|
|
defer f()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// dropAll does the heavy lifting for DropAll: it quiesces the DB, discards
// the in-memory tables, drops the whole LSM tree and all value log files,
// and returns a resume function that restarts compactions and writes. The
// resume function is returned even on error and must be invoked by the
// caller.
func (db *DB) dropAll() (func(), error) {
	db.opt.Infof("DropAll called. Blocking writes...")
	f := db.prepareToDrop()
	// prepareToDrop will stop all the incomming write and flushes any pending flush tasks.
	// Before we drop, we'll stop the compaction because anyways all the datas are going to
	// be deleted.
	db.stopCompactions()
	resume := func() {
		db.startCompactions()
		f()
	}
	// Block all foreign interactions with memory tables.
	db.Lock()
	defer db.Unlock()

	// Remove inmemory tables. Calling DecrRef for safety. Not sure if they're absolutely needed.
	db.mt.DecrRef()
	for _, mt := range db.imm {
		mt.DecrRef()
	}
	db.imm = db.imm[:0]
	db.mt = skl.NewSkiplist(arenaSize(db.opt)) // Set it up for future writes.

	// Drop every SSTable across all levels via a manifest changeset.
	num, err := db.lc.dropTree()
	if err != nil {
		return resume, err
	}
	db.opt.Infof("Deleted %d SSTables. Now deleting value logs...\n", num)

	// Delete all value log files and restart numbering from zero.
	num, err = db.vlog.dropAll()
	if err != nil {
		return resume, err
	}
	db.vhead = valuePointer{} // Zero it out.
	db.lc.nextFileID = 1
	db.opt.Infof("Deleted %d value log files. DropAll done.\n", num)
	return resume, nil
}
|
|
|
|
// DropPrefix would drop all the keys with the provided prefix. It does this in the following way:
// - Stop accepting new writes.
// - Stop memtable flushes before acquiring lock. Because we're acquring lock here
//   and memtable flush stalls for lock, which leads to deadlock
// - Flush out all memtables, skipping over keys with the given prefix, Kp.
// - Write out the value log header to memtables when flushing, so we don't accidentally bring Kp
//   back after a restart.
// - Stop compaction.
// - Compact L0->L1, skipping over Kp.
// - Compact rest of the levels, Li->Li, picking tables which have Kp.
// - Resume memtable flushes, compactions and writes.
func (db *DB) DropPrefix(prefixes ...[]byte) error {
	db.opt.Infof("DropPrefix Called")
	f := db.prepareToDrop()
	defer f()
	// Block all foreign interactions with memory tables.
	db.Lock()
	defer db.Unlock()

	// Treat the mutable memtable like an immutable one so the loop below
	// flushes all in-memory data in order.
	db.imm = append(db.imm, db.mt)
	for _, memtable := range db.imm {
		if memtable.Empty() {
			memtable.DecrRef()
			continue
		}
		task := flushTask{
			mt: memtable,
			// Ensure that the head of value log gets persisted to disk.
			vptr: db.vhead,
			dropPrefixes: prefixes,
		}
		db.opt.Debugf("Flushing memtable")
		// Flush synchronously here (not via flushChan) since the flusher is stopped.
		if err := db.handleFlushTask(task); err != nil {
			db.opt.Errorf("While trying to flush memtable: %v", err)
			return err
		}
		memtable.DecrRef()
	}
	db.stopCompactions()
	defer db.startCompactions()
	// Reset the in-memory state for future writes.
	db.imm = db.imm[:0]
	db.mt = skl.NewSkiplist(arenaSize(db.opt))

	// Drop prefixes from the levels.
	if err := db.lc.dropPrefixes(prefixes); err != nil {
		return err
	}
	db.opt.Infof("DropPrefix done")
	return nil
}
|
|
|
|
// KVList contains a list of key-value pairs.
//
// It is an alias of pb.KVList, exposed so Subscribe callers can use this
// type without importing the pb package directly.
type KVList = pb.KVList
|
|
|
|
// Subscribe can be used to watch key changes for the given key prefixes.
// At least one prefix should be passed, or an error will be returned.
// You can use an empty prefix to monitor all changes to the DB.
// This function blocks until the given context is done or an error occurs.
// The given function will be called with a new KVList containing the modified keys and the
// corresponding values.
func (db *DB) Subscribe(ctx context.Context, cb func(kv *KVList) error, prefixes ...[]byte) error {
	if cb == nil {
		return ErrNilCallback
	}

	// Register with the publisher; recvCh delivers batches of matching updates.
	c := y.NewCloser(1)
	recvCh, id := db.pub.newSubscriber(c, prefixes...)
	// slurp drains everything currently pending on recvCh into `batch`, then
	// invokes the callback once if anything was collected.
	slurp := func(batch *pb.KVList) error {
		for {
			select {
			case kvs := <-recvCh:
				batch.Kv = append(batch.Kv, kvs.Kv...)
			default:
				if len(batch.GetKv()) > 0 {
					return cb(batch)
				}
				return nil
			}
		}
	}
	for {
		select {
		case <-c.HasBeenClosed():
			// No need to delete here. Closer will be called only while
			// closing DB. Subscriber will be deleted by cleanSubscribers.
			err := slurp(new(pb.KVList))
			// Drain if any pending updates.
			c.Done()
			return err
		case <-ctx.Done():
			c.Done()
			db.pub.deleteSubscriber(id)
			// Delete the subscriber to avoid further updates.
			return ctx.Err()
		case batch := <-recvCh:
			err := slurp(batch)
			if err != nil {
				c.Done()
				// Delete the subscriber if there is an error by the callback.
				db.pub.deleteSubscriber(id)
				return err
			}
		}
	}
}
|