Skip to content

Commit 1f91f91

Browse files
authored
Merge pull request #21888 from owen-mc/py/remove-imprecise-container-steps
Python: Remove imprecise container steps #2
2 parents ba8eebe + da999ee commit 1f91f91

32 files changed

Lines changed: 487 additions & 184 deletions

File tree

python/ql/consistency-queries/DataFlowConsistency.ql

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@ private module Input implements InputSig<Location, PythonDataFlow> {
3636
// parameter, but dataflow-consistency queries should _not_ complain about there not
3737
// being a post-update node for the synthetic `**kwargs` parameter.
3838
n instanceof SynthDictSplatParameterNode
39+
or
40+
Private::Conversions::readStep(n, _, _)
3941
}
4042

4143
predicate uniqueParameterNodePositionExclude(DataFlowCallable c, ParameterPosition pos, Node p) {
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
---
2+
category: minorAnalysis
3+
---
4+
* Python taint tracking is now more precise for values flowing through container contents, such as list, set, tuple, and dictionary elements. This may remove some false positive alerts.

python/ql/lib/semmle/python/dataflow/new/internal/DataFlowPrivate.qll

Lines changed: 97 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -753,7 +753,7 @@ predicate jumpStepNotSharedWithTypeTracker(Node nodeFrom, Node nodeTo) {
753753
* As of 2024-04-02 the type-tracking library only supports precise content, so there is
754754
* no reason to include steps for list content right now.
755755
*/
756-
predicate storeStepCommon(Node nodeFrom, ContentSet c, Node nodeTo) {
756+
predicate storeStepCommon(Node nodeFrom, Content c, Node nodeTo) {
757757
tupleStoreStep(nodeFrom, c, nodeTo)
758758
or
759759
dictStoreStep(nodeFrom, c, nodeTo)
@@ -767,29 +767,31 @@ predicate storeStepCommon(Node nodeFrom, ContentSet c, Node nodeTo) {
767767
* Holds if data can flow from `nodeFrom` to `nodeTo` via an assignment to
768768
* a content in the content set `cs`.
769769
*/
770-
predicate storeStep(Node nodeFrom, ContentSet c, Node nodeTo) {
771-
storeStepCommon(nodeFrom, c, nodeTo)
772-
or
773-
listStoreStep(nodeFrom, c, nodeTo)
774-
or
775-
setStoreStep(nodeFrom, c, nodeTo)
776-
or
777-
attributeStoreStep(nodeFrom, c, nodeTo)
778-
or
779-
matchStoreStep(nodeFrom, c, nodeTo)
780-
or
781-
any(Orm::AdditionalOrmSteps es).storeStep(nodeFrom, c, nodeTo)
770+
predicate storeStep(Node nodeFrom, ContentSet cs, Node nodeTo) {
771+
exists(Content c | cs = singleton(c) |
772+
storeStepCommon(nodeFrom, c, nodeTo)
773+
or
774+
listStoreStep(nodeFrom, c, nodeTo)
775+
or
776+
setStoreStep(nodeFrom, c, nodeTo)
777+
or
778+
attributeStoreStep(nodeFrom, c, nodeTo)
779+
or
780+
matchStoreStep(nodeFrom, c, nodeTo)
781+
or
782+
any(Orm::AdditionalOrmSteps es).storeStep(nodeFrom, c, nodeTo)
783+
or
784+
synthStarArgsElementParameterNodeStoreStep(nodeFrom, c, nodeTo)
785+
or
786+
synthDictSplatArgumentNodeStoreStep(nodeFrom, c, nodeTo)
787+
or
788+
yieldStoreStep(nodeFrom, c, nodeTo)
789+
or
790+
VariableCapture::storeStep(nodeFrom, c, nodeTo)
791+
)
782792
or
783-
FlowSummaryImpl::Private::Steps::summaryStoreStep(nodeFrom.(FlowSummaryNode).getSummaryNode(), c,
793+
FlowSummaryImpl::Private::Steps::summaryStoreStep(nodeFrom.(FlowSummaryNode).getSummaryNode(), cs,
784794
nodeTo.(FlowSummaryNode).getSummaryNode())
785-
or
786-
synthStarArgsElementParameterNodeStoreStep(nodeFrom, c, nodeTo)
787-
or
788-
synthDictSplatArgumentNodeStoreStep(nodeFrom, c, nodeTo)
789-
or
790-
yieldStoreStep(nodeFrom, c, nodeTo)
791-
or
792-
VariableCapture::storeStep(nodeFrom, c, nodeTo)
793795
}
794796

795797
/**
@@ -985,7 +987,7 @@ predicate attributeStoreStep(Node nodeFrom, AttributeContent c, Node nodeTo) {
985987
/**
986988
* Subset of `readStep` that should be shared with type-tracking.
987989
*/
988-
predicate readStepCommon(Node nodeFrom, ContentSet c, Node nodeTo) {
990+
predicate readStepCommon(Node nodeFrom, Content c, Node nodeTo) {
989991
subscriptReadStep(nodeFrom, c, nodeTo)
990992
or
991993
iterableUnpackingReadStep(nodeFrom, c, nodeTo)
@@ -994,21 +996,25 @@ predicate readStepCommon(Node nodeFrom, ContentSet c, Node nodeTo) {
994996
/**
995997
* Holds if data can flow from `nodeFrom` to `nodeTo` via a read of a content in `cs`.
996998
*/
997-
predicate readStep(Node nodeFrom, ContentSet c, Node nodeTo) {
998-
readStepCommon(nodeFrom, c, nodeTo)
999-
or
1000-
matchReadStep(nodeFrom, c, nodeTo)
1001-
or
1002-
forReadStep(nodeFrom, c, nodeTo)
1003-
or
1004-
attributeReadStep(nodeFrom, c, nodeTo)
999+
predicate readStep(Node nodeFrom, ContentSet cs, Node nodeTo) {
1000+
exists(Content c | cs = singleton(c) |
1001+
readStepCommon(nodeFrom, c, nodeTo)
1002+
or
1003+
matchReadStep(nodeFrom, c, nodeTo)
1004+
or
1005+
forReadStep(nodeFrom, c, nodeTo)
1006+
or
1007+
attributeReadStep(nodeFrom, c, nodeTo)
1008+
or
1009+
synthDictSplatParameterNodeReadStep(nodeFrom, c, nodeTo)
1010+
or
1011+
VariableCapture::readStep(nodeFrom, c, nodeTo)
1012+
)
10051013
or
1006-
FlowSummaryImpl::Private::Steps::summaryReadStep(nodeFrom.(FlowSummaryNode).getSummaryNode(), c,
1014+
FlowSummaryImpl::Private::Steps::summaryReadStep(nodeFrom.(FlowSummaryNode).getSummaryNode(), cs,
10071015
nodeTo.(FlowSummaryNode).getSummaryNode())
10081016
or
1009-
synthDictSplatParameterNodeReadStep(nodeFrom, c, nodeTo)
1010-
or
1011-
VariableCapture::readStep(nodeFrom, c, nodeTo)
1017+
Conversions::readStep(nodeFrom, cs, nodeTo)
10121018
}
10131019

10141020
/** Data flows from a sequence to a subscript of the sequence. */
@@ -1064,23 +1070,68 @@ predicate attributeReadStep(Node nodeFrom, AttributeContent c, AttrRead nodeTo)
10641070
nodeTo.accesses(nodeFrom, c.getAttribute())
10651071
}
10661072

1073+
module Conversions {
1074+
private import semmle.python.Concepts
1075+
1076+
predicate decoderReadStep(Node nodeFrom, ContentSet c, Node nodeTo) {
1077+
exists(Decoding decoding |
1078+
nodeFrom = decoding.getAnInput() and
1079+
nodeTo = decoding.getOutput()
1080+
) and
1081+
c.isAnyTupleOrDictionaryElement()
1082+
}
1083+
1084+
predicate encoderReadStep(Node nodeFrom, ContentSet c, Node nodeTo) {
1085+
exists(Encoding encoding |
1086+
nodeFrom = encoding.getAnInput() and
1087+
nodeTo = encoding.getOutput()
1088+
) and
1089+
c.isAnyTupleOrDictionaryElement()
1090+
}
1091+
1092+
predicate formatReadStep(Node nodeFrom, ContentSet c, Node nodeTo) {
1093+
// % formatting
1094+
exists(BinaryExprNode fmt | fmt = nodeTo.asCfgNode() |
1095+
fmt.getOp() instanceof Mod and
1096+
fmt.getRight() = nodeFrom.asCfgNode()
1097+
) and
1098+
c.isAnyTupleElement()
1099+
or
1100+
// format_map
1101+
// see https://docs.python.org/3/library/stdtypes.html#str.format_map
1102+
nodeTo.(MethodCallNode).calls(_, "format_map") and
1103+
nodeTo.(MethodCallNode).getArg(0) = nodeFrom and
1104+
c.isAnyDictionaryElement()
1105+
}
1106+
1107+
predicate readStep(Node nodeFrom, ContentSet c, Node nodeTo) {
1108+
decoderReadStep(nodeFrom, c, nodeTo)
1109+
or
1110+
encoderReadStep(nodeFrom, c, nodeTo)
1111+
or
1112+
formatReadStep(nodeFrom, c, nodeTo)
1113+
}
1114+
}
1115+
10671116
/**
10681117
* Holds if values stored inside a content in `cs` are cleared at node `n`. For example,
10691118
* any value stored inside `f` is cleared at the pre-update node associated with `x`
10701119
* in `x.f = newValue`.
10711120
*/
1072-
predicate clearsContent(Node n, ContentSet c) {
1073-
matchClearStep(n, c)
1074-
or
1075-
attributeClearStep(n, c)
1076-
or
1077-
dictClearStep(n, c)
1078-
or
1079-
FlowSummaryImpl::Private::Steps::summaryClearsContent(n.(FlowSummaryNode).getSummaryNode(), c)
1080-
or
1081-
dictSplatParameterNodeClearStep(n, c)
1121+
predicate clearsContent(Node n, ContentSet cs) {
1122+
exists(Content c | cs = singleton(c) |
1123+
matchClearStep(n, c)
1124+
or
1125+
attributeClearStep(n, c)
1126+
or
1127+
dictClearStep(n, c)
1128+
or
1129+
dictSplatParameterNodeClearStep(n, c)
1130+
or
1131+
VariableCapture::clearsContent(n, c)
1132+
)
10821133
or
1083-
VariableCapture::clearsContent(n, c)
1134+
FlowSummaryImpl::Private::Steps::summaryClearsContent(n.(FlowSummaryNode).getSummaryNode(), cs)
10841135
}
10851136

10861137
/**

python/ql/lib/semmle/python/dataflow/new/internal/DataFlowPublic.qll

Lines changed: 63 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -898,19 +898,78 @@ class CapturedVariableContent extends Content, TCapturedVariableContent {
898898
override string getMaDRepresentation() { none() }
899899
}
900900

901+
/**
902+
* An entity that represents a set of `Content`s.
903+
*
904+
* Most `ContentSet`s are singletons (i.e. they consist of a single `Content`),
905+
* but `AnyTupleElement`, `AnyDictionaryElement` and `AnyTupleOrDictionaryElement`
906+
* act as wildcards on the read side: a read at such a `ContentSet` matches any
907+
* specific dictionary key / tuple index store, as well as (for dictionaries) the
908+
* "unknown-bucket" Content `DictionaryElementAnyContent`.
909+
*
910+
* Keeping these as wildcard `ContentSet`s (rather than enumerating one
911+
* `ContentSet` per key/index) keeps the dataflow `readSetEx` relation small
912+
* when implicit reads are used (e.g. at sinks via `defaultImplicitTaintRead`).
913+
*/
914+
private newtype TContentSet =
915+
TSingletonContent(Content c) or
916+
TAnyTupleElement() or
917+
TAnyDictionaryElement() or
918+
TAnyTupleOrDictionaryElement()
919+
901920
/**
902921
* An entity that represents a set of `Content`s.
903922
*
904923
* The set may be interpreted differently depending on whether it is
905924
* stored into (`getAStoreContent`) or read from (`getAReadContent`).
906925
*/
907-
class ContentSet instanceof Content {
926+
class ContentSet extends TContentSet {
927+
/** Holds if this content set is the singleton `{c}`. */
928+
predicate isSingleton(Content c) { this = TSingletonContent(c) }
929+
930+
/** Holds if this content set is the wildcard for all tuple elements. */
931+
predicate isAnyTupleElement() { this = TAnyTupleElement() }
932+
933+
/** Holds if this content set is the wildcard for all dictionary elements. */
934+
predicate isAnyDictionaryElement() { this = TAnyDictionaryElement() }
935+
936+
/** Holds if this content set is the wildcard for all tuple elements or dictionary elements. */
937+
predicate isAnyTupleOrDictionaryElement() { this = TAnyTupleOrDictionaryElement() }
938+
908939
/** Gets a content that may be stored into when storing into this set. */
909-
Content getAStoreContent() { result = this }
940+
Content getAStoreContent() { this = TSingletonContent(result) }
910941

911942
/** Gets a content that may be read from when reading from this set. */
912-
Content getAReadContent() { result = this }
943+
Content getAReadContent() {
944+
this = TSingletonContent(result)
945+
or
946+
// Wildcard expansion: a read at "any tuple element" matches a store at any
947+
// specific tuple index. (Stores always target a specific index, so we don't
948+
// need a `TupleElementAnyContent` Content kind here.)
949+
this = TAnyTupleElement() and result instanceof TupleElementContent
950+
or
951+
this = TAnyDictionaryElement() and
952+
(result instanceof DictionaryElementContent or result instanceof DictionaryElementAnyContent)
953+
or
954+
this = TAnyTupleOrDictionaryElement() and
955+
(
956+
result instanceof TupleElementContent or
957+
result instanceof DictionaryElementContent or
958+
result instanceof DictionaryElementAnyContent
959+
)
960+
}
913961

914962
/** Gets a textual representation of this content set. */
915-
string toString() { result = super.toString() }
963+
string toString() {
964+
exists(Content c | this = TSingletonContent(c) | result = c.toString())
965+
or
966+
this = TAnyTupleElement() and result = "Any tuple element"
967+
or
968+
this = TAnyDictionaryElement() and result = "Any dictionary element"
969+
or
970+
this = TAnyTupleOrDictionaryElement() and result = "Any tuple or dictionary element"
971+
}
916972
}
973+
974+
/** Gets the singleton `ContentSet` wrapping the `Content` `c`. */
975+
ContentSet singleton(Content c) { result = TSingletonContent(c) }

python/ql/lib/semmle/python/dataflow/new/internal/FlowSummaryImpl.qll

Lines changed: 27 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -66,21 +66,29 @@ module Input implements InputSig<Location, DataFlowImplSpecific::PythonDataFlow>
6666
}
6767

6868
string encodeContent(ContentSet cs, string arg) {
69-
cs = TListElementContent() and result = "ListElement" and arg = ""
70-
or
71-
cs = TSetElementContent() and result = "SetElement" and arg = ""
72-
or
73-
exists(int index |
74-
cs = TTupleElementContent(index) and result = "TupleElement" and arg = index.toString()
69+
exists(Content c | cs.isSingleton(c) |
70+
c = TListElementContent() and result = "ListElement" and arg = ""
71+
or
72+
c = TSetElementContent() and result = "SetElement" and arg = ""
73+
or
74+
exists(int index |
75+
c = TTupleElementContent(index) and result = "TupleElement" and arg = index.toString()
76+
)
77+
or
78+
exists(string key |
79+
c = TDictionaryElementContent(key) and result = "DictionaryElement" and arg = key
80+
)
81+
or
82+
c = TDictionaryElementAnyContent() and result = "DictionaryElementAny" and arg = ""
83+
or
84+
exists(string attr | c = TAttributeContent(attr) and result = "Attribute" and arg = attr)
7585
)
7686
or
77-
exists(string key |
78-
cs = TDictionaryElementContent(key) and result = "DictionaryElement" and arg = key
79-
)
87+
cs.isAnyTupleElement() and result = "AnyTupleElement" and arg = ""
8088
or
81-
cs = TDictionaryElementAnyContent() and result = "DictionaryElementAny" and arg = ""
89+
cs.isAnyDictionaryElement() and result = "AnyDictionaryElement" and arg = ""
8290
or
83-
exists(string attr | cs = TAttributeContent(attr) and result = "Attribute" and arg = attr)
91+
cs.isAnyTupleOrDictionaryElement() and result = "AnyTupleOrDictionaryElement" and arg = ""
8492
}
8593

8694
bindingset[token]
@@ -139,27 +147,29 @@ module Private {
139147
predicate withContent = SC::withContent/1;
140148

141149
/** Gets a summary component that represents a list element. */
142-
SummaryComponent listElement() { result = content(any(ListElementContent c)) }
150+
SummaryComponent listElement() { result = content(singleton(any(ListElementContent c))) }
143151

144152
/** Gets a summary component that represents a set element. */
145-
SummaryComponent setElement() { result = content(any(SetElementContent c)) }
153+
SummaryComponent setElement() { result = content(singleton(any(SetElementContent c))) }
146154

147155
/** Gets a summary component that represents a tuple element. */
148156
SummaryComponent tupleElement(int index) {
149-
exists(TupleElementContent c | c.getIndex() = index and result = content(c))
157+
exists(TupleElementContent c | c.getIndex() = index and result = content(singleton(c)))
150158
}
151159

152160
/** Gets a summary component that represents a dictionary element. */
153161
SummaryComponent dictionaryElement(string key) {
154-
exists(DictionaryElementContent c | c.getKey() = key and result = content(c))
162+
exists(DictionaryElementContent c | c.getKey() = key and result = content(singleton(c)))
155163
}
156164

157165
/** Gets a summary component that represents a dictionary element at any key. */
158-
SummaryComponent dictionaryElementAny() { result = content(any(DictionaryElementAnyContent c)) }
166+
SummaryComponent dictionaryElementAny() {
167+
result = content(singleton(any(DictionaryElementAnyContent c)))
168+
}
159169

160170
/** Gets a summary component that represents an attribute element. */
161171
SummaryComponent attribute(string attr) {
162-
exists(AttributeContent c | c.getAttribute() = attr and result = content(c))
172+
exists(AttributeContent c | c.getAttribute() = attr and result = content(singleton(c)))
163173
}
164174

165175
/** Gets a summary component that represents the return value of a call. */

0 commit comments

Comments
 (0)