737 lines
20 KiB
Go
737 lines
20 KiB
Go
|
/*
|
||
|
* Copyright 2017 Dgraph Labs, Inc. and Contributors
|
||
|
*
|
||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||
|
* you may not use this file except in compliance with the License.
|
||
|
* You may obtain a copy of the License at
|
||
|
*
|
||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||
|
*
|
||
|
* Unless required by applicable law or agreed to in writing, software
|
||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
|
* See the License for the specific language governing permissions and
|
||
|
* limitations under the License.
|
||
|
*/
|
||
|
|
||
|
package badger
|
||
|
|
||
|
import (
|
||
|
"bytes"
|
||
|
"fmt"
|
||
|
"hash/crc32"
|
||
|
"sort"
|
||
|
"sync"
|
||
|
"sync/atomic"
|
||
|
"time"
|
||
|
|
||
|
"github.com/dgraph-io/badger/options"
|
||
|
"github.com/dgraph-io/badger/table"
|
||
|
|
||
|
"github.com/dgraph-io/badger/y"
|
||
|
)
|
||
|
|
||
|
// prefetchStatus records whether an Item's value has already been loaded by
// prefetchValue.
type prefetchStatus uint8

const (
	// prefetched is stored in Item.status once prefetchValue has run. It is
	// iota + 1 (i.e. 1), so the zero value of prefetchStatus never equals it.
	prefetched prefetchStatus = iota + 1
)
|
||
|
|
||
|
// Item is returned during iteration. Both the Key() and Value() output is only valid until
// iterator.Next() is called.
type Item struct {
	status    prefetchStatus // Set to prefetched once prefetchValue has run.
	err       error          // Error recorded by prefetchValue.
	wg        sync.WaitGroup // Waited on before the value is read; tracks the prefetch goroutine.
	db        *DB
	key       []byte
	vptr      []byte // Inline value, or an encoded valuePointer when bitValuePointer is set in meta.
	meta      byte   // We need to store meta to know about bitValuePointer.
	userMeta  byte
	expiresAt uint64
	val       []byte   // Value stored by prefetchValue; reset to nil by Iterator.fill.
	slice     *y.Slice // Scratch buffer used by yieldItemValue and prefetchValue.
	next      *Item    // Link to the following Item when held in a list.
	version   uint64
	txn       *Txn
}
|
||
|
|
||
|
// String returns a string representation of Item
|
||
|
func (item *Item) String() string {
|
||
|
return fmt.Sprintf("key=%q, version=%d, meta=%x", item.Key(), item.Version(), item.meta)
|
||
|
}
|
||
|
|
||
|
// Key returns the key.
//
// Key is only valid as long as item is valid, or transaction is valid. If you need to use it
// outside its validity, please use KeyCopy.
//
// The returned slice is the item's own key buffer, not a copy.
func (item *Item) Key() []byte {
	return item.key
}
|
||
|
|
||
|
// KeyCopy returns a copy of the key of the item, writing it to dst slice.
// If nil is passed, or capacity of dst isn't sufficient, a new slice would be allocated and
// returned.
//
// The copy is delegated to y.SafeCopy.
func (item *Item) KeyCopy(dst []byte) []byte {
	return y.SafeCopy(dst, item.key)
}
|
||
|
|
||
|
// Version returns the commit timestamp of the item, as stored in the item's
// version field.
func (item *Item) Version() uint64 {
	return item.version
}
|
||
|
|
||
|
// Value retrieves the value of the item from the value log.
|
||
|
//
|
||
|
// This method must be called within a transaction. Calling it outside a
|
||
|
// transaction is considered undefined behavior. If an iterator is being used,
|
||
|
// then Item.Value() is defined in the current iteration only, because items are
|
||
|
// reused.
|
||
|
//
|
||
|
// If you need to use a value outside a transaction, please use Item.ValueCopy
|
||
|
// instead, or copy it yourself. Value might change once discard or commit is called.
|
||
|
// Use ValueCopy if you want to do a Set after Get.
|
||
|
func (item *Item) Value(fn func(val []byte) error) error {
|
||
|
item.wg.Wait()
|
||
|
if item.status == prefetched {
|
||
|
if item.err == nil && fn != nil {
|
||
|
if err := fn(item.val); err != nil {
|
||
|
return err
|
||
|
}
|
||
|
}
|
||
|
return item.err
|
||
|
}
|
||
|
buf, cb, err := item.yieldItemValue()
|
||
|
defer runCallback(cb)
|
||
|
if err != nil {
|
||
|
return err
|
||
|
}
|
||
|
if fn != nil {
|
||
|
return fn(buf)
|
||
|
}
|
||
|
return nil
|
||
|
}
|
||
|
|
||
|
// ValueCopy returns a copy of the value of the item from the value log, writing it to dst slice.
|
||
|
// If nil is passed, or capacity of dst isn't sufficient, a new slice would be allocated and
|
||
|
// returned. Tip: It might make sense to reuse the returned slice as dst argument for the next call.
|
||
|
//
|
||
|
// This function is useful in long running iterate/update transactions to avoid a write deadlock.
|
||
|
// See Github issue: https://github.com/dgraph-io/badger/issues/315
|
||
|
func (item *Item) ValueCopy(dst []byte) ([]byte, error) {
|
||
|
item.wg.Wait()
|
||
|
if item.status == prefetched {
|
||
|
return y.SafeCopy(dst, item.val), item.err
|
||
|
}
|
||
|
buf, cb, err := item.yieldItemValue()
|
||
|
defer runCallback(cb)
|
||
|
return y.SafeCopy(dst, buf), err
|
||
|
}
|
||
|
|
||
|
func (item *Item) hasValue() bool {
|
||
|
if item.meta == 0 && item.vptr == nil {
|
||
|
// key not found
|
||
|
return false
|
||
|
}
|
||
|
return true
|
||
|
}
|
||
|
|
||
|
// IsDeletedOrExpired returns true if item contains deleted or expired value.
// The decision is delegated to isDeletedOrExpired using the item's meta byte
// and expiry timestamp.
func (item *Item) IsDeletedOrExpired() bool {
	return isDeletedOrExpired(item.meta, item.expiresAt)
}
|
||
|
|
||
|
// DiscardEarlierVersions returns whether the item was created with the
|
||
|
// option to discard earlier versions of a key when multiple are available.
|
||
|
func (item *Item) DiscardEarlierVersions() bool {
|
||
|
return item.meta&bitDiscardEarlierVersions > 0
|
||
|
}
|
||
|
|
||
|
// yieldItemValue resolves the bytes of the item's value.
//
// It returns the value, an optional callback that the caller must run (via
// runCallback) once it is done with the returned bytes, and an error. A nil
// value with a nil error means no value is available.
//
// Inline values (bitValuePointer not set) are copied from item.vptr into
// item.slice. Otherwise item.vptr is decoded as a valuePointer and read from
// the value log. If that read reports ErrRetry, the value is looked up again
// under the badgerMove keyspace and the loop repeats with the item's vptr and
// meta updated from that entry.
func (item *Item) yieldItemValue() ([]byte, func(), error) {
	key := item.Key() // No need to copy.
	for {
		if !item.hasValue() {
			return nil, nil, nil
		}

		// Lazily create the scratch buffer for items that were built without one.
		if item.slice == nil {
			item.slice = new(y.Slice)
		}

		if (item.meta & bitValuePointer) == 0 {
			val := item.slice.Resize(len(item.vptr))
			copy(val, item.vptr)
			return val, nil, nil
		}

		var vp valuePointer
		vp.Decode(item.vptr)
		result, cb, err := item.db.vlog.Read(vp, item.slice)
		if err != ErrRetry {
			return result, cb, err
		}
		if bytes.HasPrefix(key, badgerMove) {
			// err == ErrRetry
			// Error is retry even after checking the move keyspace. So, let's
			// just assume that value is not present.
			return nil, cb, nil
		}

		// The value pointer is pointing to a deleted value log. Look for the
		// move key and read that instead.
		runCallback(cb)
		// Do not put badgerMove on the left in append. It seems to cause some sort of manipulation.
		keyTs := y.KeyWithTs(item.Key(), item.Version())
		key = make([]byte, len(badgerMove)+len(keyTs))
		n := copy(key, badgerMove)
		copy(key[n:], keyTs)
		// Note that we can't set item.key to move key, because that would
		// change the key user sees before and after this call. Also, this move
		// logic is internal logic and should not impact the external behavior
		// of the retrieval.
		vs, err := item.db.get(key)
		if err != nil {
			return nil, nil, err
		}
		// A move entry with a different version does not belong to this item.
		if vs.Version != item.Version() {
			return nil, nil, nil
		}
		// Bug fix: Always copy the vs.Value into vptr here. Otherwise, when item is reused this
		// slice gets overwritten.
		item.vptr = y.SafeCopy(item.vptr, vs.Value)
		item.meta &^= bitValuePointer // Clear the value pointer bit.
		if vs.Meta&bitValuePointer > 0 {
			item.meta |= bitValuePointer // This meta would only be about value pointer.
		}
	}
}
|
||
|
|
||
|
// runCallback invokes cb unless it is nil, so callers can defer it without
// checking whether a callback was actually returned.
func runCallback(cb func()) {
	if cb == nil {
		return
	}
	cb()
}
|
||
|
|
||
|
func (item *Item) prefetchValue() {
|
||
|
val, cb, err := item.yieldItemValue()
|
||
|
defer runCallback(cb)
|
||
|
|
||
|
item.err = err
|
||
|
item.status = prefetched
|
||
|
if val == nil {
|
||
|
return
|
||
|
}
|
||
|
if item.db.opt.ValueLogLoadingMode == options.MemoryMap {
|
||
|
buf := item.slice.Resize(len(val))
|
||
|
copy(buf, val)
|
||
|
item.val = buf
|
||
|
} else {
|
||
|
item.val = val
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// EstimatedSize returns the approximate size of the key-value pair.
|
||
|
//
|
||
|
// This can be called while iterating through a store to quickly estimate the
|
||
|
// size of a range of key-value pairs (without fetching the corresponding
|
||
|
// values).
|
||
|
func (item *Item) EstimatedSize() int64 {
|
||
|
if !item.hasValue() {
|
||
|
return 0
|
||
|
}
|
||
|
if (item.meta & bitValuePointer) == 0 {
|
||
|
return int64(len(item.key) + len(item.vptr))
|
||
|
}
|
||
|
var vp valuePointer
|
||
|
vp.Decode(item.vptr)
|
||
|
return int64(vp.Len) // includes key length.
|
||
|
}
|
||
|
|
||
|
// KeySize returns the size of the key, i.e. the length of the key as held by
// the item (without a timestamp suffix).
// Exact size of the key is key + 8 bytes of timestamp
func (item *Item) KeySize() int64 {
	return int64(len(item.key))
}
|
||
|
|
||
|
// ValueSize returns the exact size of the value.
|
||
|
//
|
||
|
// This can be called to quickly estimate the size of a value without fetching
|
||
|
// it.
|
||
|
func (item *Item) ValueSize() int64 {
|
||
|
if !item.hasValue() {
|
||
|
return 0
|
||
|
}
|
||
|
if (item.meta & bitValuePointer) == 0 {
|
||
|
return int64(len(item.vptr))
|
||
|
}
|
||
|
var vp valuePointer
|
||
|
vp.Decode(item.vptr)
|
||
|
|
||
|
klen := int64(len(item.key) + 8) // 8 bytes for timestamp.
|
||
|
return int64(vp.Len) - klen - headerBufSize - crc32.Size
|
||
|
}
|
||
|
|
||
|
// UserMeta returns the userMeta set by the user. Typically, this byte, optionally set by the user
// is used to interpret the value.
func (item *Item) UserMeta() byte {
	return item.userMeta
}
|
||
|
|
||
|
// ExpiresAt returns a Unix time value (in seconds) indicating when the item
// will be considered expired. 0 indicates that the item will never expire.
func (item *Item) ExpiresAt() uint64 {
	return item.expiresAt
}
|
||
|
|
||
|
// list is a singly linked queue of Items chained through Item.next: push
// appends at the tail and pop removes from the head.
//
// TODO: Switch this to use linked list container in Go.
type list struct {
	head *Item
	tail *Item
}
|
||
|
|
||
|
func (l *list) push(i *Item) {
|
||
|
i.next = nil
|
||
|
if l.tail == nil {
|
||
|
l.head = i
|
||
|
l.tail = i
|
||
|
return
|
||
|
}
|
||
|
l.tail.next = i
|
||
|
l.tail = i
|
||
|
}
|
||
|
|
||
|
func (l *list) pop() *Item {
|
||
|
if l.head == nil {
|
||
|
return nil
|
||
|
}
|
||
|
i := l.head
|
||
|
if l.head == l.tail {
|
||
|
l.tail = nil
|
||
|
l.head = nil
|
||
|
} else {
|
||
|
l.head = i.next
|
||
|
}
|
||
|
i.next = nil
|
||
|
return i
|
||
|
}
|
||
|
|
||
|
// IteratorOptions is used to set options when iterating over Badger key-value
// stores.
//
// This package provides DefaultIteratorOptions which contains options that
// should work for most applications. Consider using that as a starting point
// before customizing it for your own needs.
type IteratorOptions struct {
	// Indicates whether we should prefetch values during iteration and store them.
	PrefetchValues bool
	// How many KV pairs to prefetch while iterating. Valid only if PrefetchValues is true.
	PrefetchSize int
	Reverse      bool // Direction of iteration. False is forward, true is backward.
	AllVersions  bool // Fetch all valid versions of the same key.

	// The following option is used to narrow down the SSTables that iterator picks up. If
	// Prefix is specified, only tables which could have this prefix are picked based on their range
	// of keys.
	Prefix      []byte // Only iterate over this given prefix.
	prefixIsKey bool   // If set, use the prefix for bloom filter lookup.

	InternalAccess bool // Used to allow internal access to badger keys.
}
|
||
|
|
||
|
func (opt *IteratorOptions) compareToPrefix(key []byte) int {
|
||
|
// We should compare key without timestamp. For example key - a[TS] might be > "aa" prefix.
|
||
|
key = y.ParseKey(key)
|
||
|
if len(key) > len(opt.Prefix) {
|
||
|
key = key[:len(opt.Prefix)]
|
||
|
}
|
||
|
return bytes.Compare(key, opt.Prefix)
|
||
|
}
|
||
|
|
||
|
func (opt *IteratorOptions) pickTable(t table.TableInterface) bool {
|
||
|
if len(opt.Prefix) == 0 {
|
||
|
return true
|
||
|
}
|
||
|
if opt.compareToPrefix(t.Smallest()) > 0 {
|
||
|
return false
|
||
|
}
|
||
|
if opt.compareToPrefix(t.Biggest()) < 0 {
|
||
|
return false
|
||
|
}
|
||
|
// Bloom filter lookup would only work if opt.Prefix does NOT have the read
|
||
|
// timestamp as part of the key.
|
||
|
if opt.prefixIsKey && t.DoesNotHave(opt.Prefix) {
|
||
|
return false
|
||
|
}
|
||
|
return true
|
||
|
}
|
||
|
|
||
|
// pickTables picks the necessary table for the iterator. This function also assumes
|
||
|
// that the tables are sorted in the right order.
|
||
|
func (opt *IteratorOptions) pickTables(all []*table.Table) []*table.Table {
|
||
|
if len(opt.Prefix) == 0 {
|
||
|
out := make([]*table.Table, len(all))
|
||
|
copy(out, all)
|
||
|
return out
|
||
|
}
|
||
|
sIdx := sort.Search(len(all), func(i int) bool {
|
||
|
return opt.compareToPrefix(all[i].Biggest()) >= 0
|
||
|
})
|
||
|
if sIdx == len(all) {
|
||
|
// Not found.
|
||
|
return []*table.Table{}
|
||
|
}
|
||
|
|
||
|
filtered := all[sIdx:]
|
||
|
if !opt.prefixIsKey {
|
||
|
eIdx := sort.Search(len(filtered), func(i int) bool {
|
||
|
return opt.compareToPrefix(filtered[i].Smallest()) > 0
|
||
|
})
|
||
|
out := make([]*table.Table, len(filtered[:eIdx]))
|
||
|
copy(out, filtered[:eIdx])
|
||
|
return out
|
||
|
}
|
||
|
|
||
|
var out []*table.Table
|
||
|
for _, t := range filtered {
|
||
|
// When we encounter the first table whose smallest key is higher than
|
||
|
// opt.Prefix, we can stop.
|
||
|
if opt.compareToPrefix(t.Smallest()) > 0 {
|
||
|
return out
|
||
|
}
|
||
|
// opt.Prefix is actually the key. So, we can run bloom filter checks
|
||
|
// as well.
|
||
|
if t.DoesNotHave(opt.Prefix) {
|
||
|
continue
|
||
|
}
|
||
|
out = append(out, t)
|
||
|
}
|
||
|
return out
|
||
|
}
|
||
|
|
||
|
// DefaultIteratorOptions contains default options when iterating over Badger key-value stores:
// forward iteration over the latest versions only, with value prefetching
// enabled and a prefetch size of 100.
var DefaultIteratorOptions = IteratorOptions{
	PrefetchValues: true,
	PrefetchSize:   100,
	Reverse:        false,
	AllVersions:    false,
}
|
||
|
|
||
|
// Iterator helps iterating over the KV pairs in a lexicographically sorted order.
type Iterator struct {
	iitr   y.Iterator // Merged iterator built by NewIterator.
	txn    *Txn
	readTs uint64 // Versions above this timestamp are skipped by parseItem.

	opt   IteratorOptions
	item  *Item // Current item; nil when there is none.
	data  list  // Parsed items queued behind the current one.
	waste list  // Items retired by Next and Seek, reused by newItem.

	lastKey []byte // Used to skip over multiple versions of the same key.

	closed bool // Set by Close so that repeated calls are no-ops.
}
|
||
|
|
||
|
// NewIterator returns a new iterator. Depending upon the options, either only keys, or both
|
||
|
// key-value pairs would be fetched. The keys are returned in lexicographically sorted order.
|
||
|
// Using prefetch is recommended if you're doing a long running iteration, for performance.
|
||
|
//
|
||
|
// Multiple Iterators:
|
||
|
// For a read-only txn, multiple iterators can be running simultaneously. However, for a read-write
|
||
|
// txn, only one can be running at one time to avoid race conditions, because Txn is thread-unsafe.
|
||
|
func (txn *Txn) NewIterator(opt IteratorOptions) *Iterator {
|
||
|
if txn.discarded {
|
||
|
panic("Transaction has already been discarded")
|
||
|
}
|
||
|
// Do not change the order of the next if. We must track the number of running iterators.
|
||
|
if atomic.AddInt32(&txn.numIterators, 1) > 1 && txn.update {
|
||
|
atomic.AddInt32(&txn.numIterators, -1)
|
||
|
panic("Only one iterator can be active at one time, for a RW txn.")
|
||
|
}
|
||
|
|
||
|
// TODO: If Prefix is set, only pick those memtables which have keys with
|
||
|
// the prefix.
|
||
|
tables, decr := txn.db.getMemTables()
|
||
|
defer decr()
|
||
|
txn.db.vlog.incrIteratorCount()
|
||
|
var iters []y.Iterator
|
||
|
if itr := txn.newPendingWritesIterator(opt.Reverse); itr != nil {
|
||
|
iters = append(iters, itr)
|
||
|
}
|
||
|
for i := 0; i < len(tables); i++ {
|
||
|
iters = append(iters, tables[i].NewUniIterator(opt.Reverse))
|
||
|
}
|
||
|
iters = txn.db.lc.appendIterators(iters, &opt) // This will increment references.
|
||
|
|
||
|
res := &Iterator{
|
||
|
txn: txn,
|
||
|
iitr: table.NewMergeIterator(iters, opt.Reverse),
|
||
|
opt: opt,
|
||
|
readTs: txn.readTs,
|
||
|
}
|
||
|
return res
|
||
|
}
|
||
|
|
||
|
// NewKeyIterator is just like NewIterator, but allows the user to iterate over all versions of a
|
||
|
// single key. Internally, it sets the Prefix option in provided opt, and uses that prefix to
|
||
|
// additionally run bloom filter lookups before picking tables from the LSM tree.
|
||
|
func (txn *Txn) NewKeyIterator(key []byte, opt IteratorOptions) *Iterator {
|
||
|
if len(opt.Prefix) > 0 {
|
||
|
panic("opt.Prefix should be nil for NewKeyIterator.")
|
||
|
}
|
||
|
opt.Prefix = key // This key must be without the timestamp.
|
||
|
opt.prefixIsKey = true
|
||
|
opt.AllVersions = true
|
||
|
return txn.NewIterator(opt)
|
||
|
}
|
||
|
|
||
|
func (it *Iterator) newItem() *Item {
|
||
|
item := it.waste.pop()
|
||
|
if item == nil {
|
||
|
item = &Item{slice: new(y.Slice), db: it.txn.db, txn: it.txn}
|
||
|
}
|
||
|
return item
|
||
|
}
|
||
|
|
||
|
// Item returns pointer to the current key-value pair.
|
||
|
// This item is only valid until it.Next() gets called.
|
||
|
func (it *Iterator) Item() *Item {
|
||
|
tx := it.txn
|
||
|
tx.addReadKey(it.item.Key())
|
||
|
return it.item
|
||
|
}
|
||
|
|
||
|
// Valid returns false when iteration is done.
|
||
|
func (it *Iterator) Valid() bool {
|
||
|
if it.item == nil {
|
||
|
return false
|
||
|
}
|
||
|
if it.opt.prefixIsKey {
|
||
|
return bytes.Equal(it.item.key, it.opt.Prefix)
|
||
|
}
|
||
|
return bytes.HasPrefix(it.item.key, it.opt.Prefix)
|
||
|
}
|
||
|
|
||
|
// ValidForPrefix returns false when iteration is done
|
||
|
// or when the current key is not prefixed by the specified prefix.
|
||
|
func (it *Iterator) ValidForPrefix(prefix []byte) bool {
|
||
|
return it.Valid() && bytes.HasPrefix(it.item.key, prefix)
|
||
|
}
|
||
|
|
||
|
// Close would close the iterator. It is important to call this when you're done with iteration.
|
||
|
func (it *Iterator) Close() {
|
||
|
if it.closed {
|
||
|
return
|
||
|
}
|
||
|
it.closed = true
|
||
|
|
||
|
it.iitr.Close()
|
||
|
// It is important to wait for the fill goroutines to finish. Otherwise, we might leave zombie
|
||
|
// goroutines behind, which are waiting to acquire file read locks after DB has been closed.
|
||
|
waitFor := func(l list) {
|
||
|
item := l.pop()
|
||
|
for item != nil {
|
||
|
item.wg.Wait()
|
||
|
item = l.pop()
|
||
|
}
|
||
|
}
|
||
|
waitFor(it.waste)
|
||
|
waitFor(it.data)
|
||
|
|
||
|
// TODO: We could handle this error.
|
||
|
_ = it.txn.db.vlog.decrIteratorCount()
|
||
|
atomic.AddInt32(&it.txn.numIterators, -1)
|
||
|
}
|
||
|
|
||
|
// Next would advance the iterator by one. Always check it.Valid() after a Next()
|
||
|
// to ensure you have access to a valid it.Item().
|
||
|
func (it *Iterator) Next() {
|
||
|
// Reuse current item
|
||
|
it.item.wg.Wait() // Just cleaner to wait before pushing to avoid doing ref counting.
|
||
|
it.waste.push(it.item)
|
||
|
|
||
|
// Set next item to current
|
||
|
it.item = it.data.pop()
|
||
|
|
||
|
for it.iitr.Valid() {
|
||
|
if it.parseItem() {
|
||
|
// parseItem calls one extra next.
|
||
|
// This is used to deal with the complexity of reverse iteration.
|
||
|
break
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
func isDeletedOrExpired(meta byte, expiresAt uint64) bool {
|
||
|
if meta&bitDelete > 0 {
|
||
|
return true
|
||
|
}
|
||
|
if expiresAt == 0 {
|
||
|
return false
|
||
|
}
|
||
|
return expiresAt <= uint64(time.Now().Unix())
|
||
|
}
|
||
|
|
||
|
// parseItem is a complex function because it needs to handle both forward and reverse iteration
|
||
|
// implementation. We store keys such that their versions are sorted in descending order. This makes
|
||
|
// forward iteration efficient, but revese iteration complicated. This tradeoff is better because
|
||
|
// forward iteration is more common than reverse.
|
||
|
//
|
||
|
// This function advances the iterator.
|
||
|
func (it *Iterator) parseItem() bool {
|
||
|
mi := it.iitr
|
||
|
key := mi.Key()
|
||
|
|
||
|
setItem := func(item *Item) {
|
||
|
if it.item == nil {
|
||
|
it.item = item
|
||
|
} else {
|
||
|
it.data.push(item)
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Skip badger keys.
|
||
|
if !it.opt.InternalAccess && bytes.HasPrefix(key, badgerPrefix) {
|
||
|
mi.Next()
|
||
|
return false
|
||
|
}
|
||
|
|
||
|
// Skip any versions which are beyond the readTs.
|
||
|
version := y.ParseTs(key)
|
||
|
if version > it.readTs {
|
||
|
mi.Next()
|
||
|
return false
|
||
|
}
|
||
|
|
||
|
if it.opt.AllVersions {
|
||
|
// Return deleted or expired values also, otherwise user can't figure out
|
||
|
// whether the key was deleted.
|
||
|
item := it.newItem()
|
||
|
it.fill(item)
|
||
|
setItem(item)
|
||
|
mi.Next()
|
||
|
return true
|
||
|
}
|
||
|
|
||
|
// If iterating in forward direction, then just checking the last key against current key would
|
||
|
// be sufficient.
|
||
|
if !it.opt.Reverse {
|
||
|
if y.SameKey(it.lastKey, key) {
|
||
|
mi.Next()
|
||
|
return false
|
||
|
}
|
||
|
// Only track in forward direction.
|
||
|
// We should update lastKey as soon as we find a different key in our snapshot.
|
||
|
// Consider keys: a 5, b 7 (del), b 5. When iterating, lastKey = a.
|
||
|
// Then we see b 7, which is deleted. If we don't store lastKey = b, we'll then return b 5,
|
||
|
// which is wrong. Therefore, update lastKey here.
|
||
|
it.lastKey = y.SafeCopy(it.lastKey, mi.Key())
|
||
|
}
|
||
|
|
||
|
FILL:
|
||
|
// If deleted, advance and return.
|
||
|
vs := mi.Value()
|
||
|
if isDeletedOrExpired(vs.Meta, vs.ExpiresAt) {
|
||
|
mi.Next()
|
||
|
return false
|
||
|
}
|
||
|
|
||
|
item := it.newItem()
|
||
|
it.fill(item)
|
||
|
// fill item based on current cursor position. All Next calls have returned, so reaching here
|
||
|
// means no Next was called.
|
||
|
|
||
|
mi.Next() // Advance but no fill item yet.
|
||
|
if !it.opt.Reverse || !mi.Valid() { // Forward direction, or invalid.
|
||
|
setItem(item)
|
||
|
return true
|
||
|
}
|
||
|
|
||
|
// Reverse direction.
|
||
|
nextTs := y.ParseTs(mi.Key())
|
||
|
mik := y.ParseKey(mi.Key())
|
||
|
if nextTs <= it.readTs && bytes.Equal(mik, item.key) {
|
||
|
// This is a valid potential candidate.
|
||
|
goto FILL
|
||
|
}
|
||
|
// Ignore the next candidate. Return the current one.
|
||
|
setItem(item)
|
||
|
return true
|
||
|
}
|
||
|
|
||
|
func (it *Iterator) fill(item *Item) {
|
||
|
vs := it.iitr.Value()
|
||
|
item.meta = vs.Meta
|
||
|
item.userMeta = vs.UserMeta
|
||
|
item.expiresAt = vs.ExpiresAt
|
||
|
|
||
|
item.version = y.ParseTs(it.iitr.Key())
|
||
|
item.key = y.SafeCopy(item.key, y.ParseKey(it.iitr.Key()))
|
||
|
|
||
|
item.vptr = y.SafeCopy(item.vptr, vs.Value)
|
||
|
item.val = nil
|
||
|
if it.opt.PrefetchValues {
|
||
|
item.wg.Add(1)
|
||
|
go func() {
|
||
|
// FIXME we are not handling errors here.
|
||
|
item.prefetchValue()
|
||
|
item.wg.Done()
|
||
|
}()
|
||
|
}
|
||
|
}
|
||
|
|
||
|
func (it *Iterator) prefetch() {
|
||
|
prefetchSize := 2
|
||
|
if it.opt.PrefetchValues && it.opt.PrefetchSize > 1 {
|
||
|
prefetchSize = it.opt.PrefetchSize
|
||
|
}
|
||
|
|
||
|
i := it.iitr
|
||
|
var count int
|
||
|
it.item = nil
|
||
|
for i.Valid() {
|
||
|
if !it.parseItem() {
|
||
|
continue
|
||
|
}
|
||
|
count++
|
||
|
if count == prefetchSize {
|
||
|
break
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Seek would seek to the provided key if present. If absent, it would seek to the next
|
||
|
// smallest key greater than the provided key if iterating in the forward direction.
|
||
|
// Behavior would be reversed if iterating backwards.
|
||
|
func (it *Iterator) Seek(key []byte) {
|
||
|
for i := it.data.pop(); i != nil; i = it.data.pop() {
|
||
|
i.wg.Wait()
|
||
|
it.waste.push(i)
|
||
|
}
|
||
|
|
||
|
it.lastKey = it.lastKey[:0]
|
||
|
if len(key) == 0 {
|
||
|
key = it.opt.Prefix
|
||
|
}
|
||
|
if len(key) == 0 {
|
||
|
it.iitr.Rewind()
|
||
|
it.prefetch()
|
||
|
return
|
||
|
}
|
||
|
|
||
|
if !it.opt.Reverse {
|
||
|
key = y.KeyWithTs(key, it.txn.readTs)
|
||
|
} else {
|
||
|
key = y.KeyWithTs(key, 0)
|
||
|
}
|
||
|
it.iitr.Seek(key)
|
||
|
it.prefetch()
|
||
|
}
|
||
|
|
||
|
// Rewind would rewind the iterator cursor all the way to zero-th position, which would be the
// smallest key if iterating forward, and largest if iterating backward. It does not keep track of
// whether the cursor started with a Seek().
//
// It is implemented as Seek(nil), so a non-empty Prefix option is used as the
// seek key.
func (it *Iterator) Rewind() {
	it.Seek(nil)
}
|